-rw-r--r--  AUTHORS                                    |   1
-rw-r--r--  README.md                                  |   6
-rw-r--r--  docs/supportedsites.md                     |   5
-rw-r--r--  youtube_dl/extractor/__init__.py           |   1
-rw-r--r--  youtube_dl/extractor/crunchyroll.py        |  31
-rw-r--r--  youtube_dl/extractor/dailymotion.py        |  18
-rw-r--r--  youtube_dl/extractor/drtuber.py            |  21
-rw-r--r--  youtube_dl/extractor/generic.py            |  16
-rw-r--r--  youtube_dl/extractor/hentaistigma.py       |  11
-rw-r--r--  youtube_dl/extractor/howcast.py            |  35
-rw-r--r--  youtube_dl/extractor/ina.py                |   2
-rw-r--r--  youtube_dl/extractor/infoq.py              |   5
-rw-r--r--  youtube_dl/extractor/npo.py                |  20
-rw-r--r--  youtube_dl/extractor/pbs.py                |  34
-rw-r--r--  youtube_dl/extractor/thisamericanlife.py   |  40
-rw-r--r--  youtube_dl/extractor/youtube.py            | 101
-rw-r--r--  youtube_dl/options.py                      |   2
-rw-r--r--  youtube_dl/version.py                      |   2
18 files changed, 257 insertions(+), 94 deletions(-)
diff --git a/AUTHORS b/AUTHORS
index 117b9c219..d5418dd37 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -129,3 +129,4 @@ Mister Hat
Peter Ding
jackyzy823
George Brighton
+Remita Amine
diff --git a/README.md b/README.md
index e3452c9e1..93e7fb06f 100644
--- a/README.md
+++ b/README.md
@@ -108,7 +108,7 @@ which means you can modify it, redistribute it or use it however you like.
--playlist-reverse Download playlist videos in reverse order
--xattr-set-filesize Set file xattribute ytdl.filesize with expected filesize (experimental)
--hls-prefer-native Use the native HLS downloader instead of ffmpeg (experimental)
- --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,curl,wget
+ --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,curl,httpie,wget
--external-downloader-args ARGS Give these arguments to the external downloader
## Filesystem Options:
@@ -190,8 +190,8 @@ which means you can modify it, redistribute it or use it however you like.
--all-formats Download all available video formats
--prefer-free-formats Prefer free video formats unless a specific one is requested
-F, --list-formats List all available formats
- --youtube-skip-dash-manifest Do not download the DASH manifest on YouTube videos
- --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.Ignored if no
+ --youtube-skip-dash-manifest Do not download the DASH manifests and related data on YouTube videos
+ --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv. Ignored if no
merge is required
## Subtitle Options:
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 9a50fbd1c..687936103 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -283,6 +283,7 @@
- **Motherless**
- **Motorsport**: motorsport.com
- **MovieClips**
+ - **MovieFap**
- **Moviezine**
- **movshare**: MovShare
- **MPORA**
@@ -440,6 +441,8 @@
- **smotri:broadcast**: Smotri.com broadcasts
- **smotri:community**: Smotri.com community videos
- **smotri:user**: Smotri.com user videos
+ - **SnagFilms**
+ - **SnagFilmsEmbed**
- **Snotr**
- **Sohu**
- **soompi**
@@ -502,6 +505,7 @@
- **TheOnion**
- **ThePlatform**
- **TheSixtyOne**
+ - **ThisAmericanLife**
- **ThisAV**
- **THVideo**
- **THVideoPlaylist**
@@ -542,6 +546,7 @@
- **twitch:stream**
- **twitch:video**
- **twitch:vod**
+ - **TwitterCard**
- **Ubu**
- **udemy**
- **udemy:course**
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index d44339200..aba62db53 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -569,6 +569,7 @@ from .tf1 import TF1IE
from .theonion import TheOnionIE
from .theplatform import ThePlatformIE
from .thesixtyone import TheSixtyOneIE
+from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 41f0c736d..73f1e22ef 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -27,7 +27,7 @@ from ..aes import (
class CrunchyrollIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_NETRC_MACHINE = 'crunchyroll'
_TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@@ -46,6 +46,22 @@ class CrunchyrollIE(InfoExtractor):
'skip_download': True,
},
}, {
+ 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
+ 'info_dict': {
+ 'id': '589804',
+ 'ext': 'flv',
+ 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
+ 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Danny Choo Network',
+ 'upload_date': '20120213',
+ },
+ 'params': {
+ # rtmp
+ 'skip_download': True,
+ },
+
+ }, {
'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
'only_matching': True,
}]
@@ -251,16 +267,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
stream_quality, stream_format = self._FORMAT_IDS[fmt]
video_format = fmt + 'p'
- streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
- # urlencode doesn't work!
- streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format
+ streamdata_req = compat_urllib_request.Request(
+ 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
+ % (stream_id, stream_format, stream_quality),
+ compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
- streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
streamdata = self._download_xml(
streamdata_req, video_id,
note='Downloading media info for %s' % video_format)
- video_url = streamdata.find('./host').text
- video_play_path = streamdata.find('./file').text
+ stream_info = streamdata.find('./{default}preload/stream_info')
+ video_url = stream_info.find('./host').text
+ video_play_path = stream_info.find('./file').text
formats.append({
'url': video_url,
'play_path': video_play_path,
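Illustrative sketch (not part of the patch): the hunk above switches the stream lookup to the RpcApiVideoPlayer_GetStandardConfig call and reads the RTMP host and play path from the <preload><stream_info> node. The endpoint, parameters and the './{default}preload/stream_info' lookup are taken from the diff; the function name and the plain Python 3 urllib/ElementTree plumbing are assumptions for demonstration only.

    import urllib.parse
    import urllib.request
    import xml.etree.ElementTree as ET

    def fetch_stream_info(page_url, media_id, video_format, video_quality):
        req_url = (
            'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig'
            '&media_id=%s&video_format=%s&video_quality=%s'
            % (media_id, video_format, video_quality))
        # The POST body only carries the referring page, URL-encoded, as in the new code.
        data = urllib.parse.urlencode({'current_page': page_url}).encode('utf-8')
        req = urllib.request.Request(req_url, data)
        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
        xml_doc = ET.fromstring(urllib.request.urlopen(req).read())
        # The response declares an XML namespace literally named "default",
        # hence the '{default}' prefix on the element lookup.
        stream_info = xml_doc.find('./{default}preload/stream_info')
        return stream_info.find('./host').text, stream_info.find('./file').text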
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 96f0ed9ad..8852f0add 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -254,22 +254,30 @@ class DailymotionUserIE(DailymotionPlaylistIE):
class DailymotionCloudIE(DailymotionBaseInfoExtractor):
- _VALID_URL = r'http://api\.dmcloud\.net/embed/[^/]+/(?P<id>[^/?]+)'
+ _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/'
+ _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX
+ _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX
- _TEST = {
+ _TESTS = [{
# From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
# Tested at FranceTvInfo_2
'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
'only_matching': True,
- }
+ }, {
+ # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html
+ 'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1',
+ 'only_matching': True,
+ }]
@classmethod
def _extract_dmcloud_url(self, webpage):
- mobj = re.search(r'<iframe[^>]+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+ mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, webpage)
if mobj:
return mobj.group(1)
- mobj = re.search(r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+ mobj = re.search(
+ r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % self._VALID_EMBED_URL,
+ webpage)
if mobj:
return mobj.group(1)
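A quick, self-contained check (not part of the patch) that the widened pattern built from _VALID_URL_PREFIX matches both embed URL shapes listed in the tests above. The iframes reuse the test URLs, with the auth tokens shortened to stand-ins.

    import re

    _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/'
    _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX

    samples = [
        '<iframe frameborder="0" src="http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1&autoplay=1"></iframe>',
        '<iframe frameborder="0" src="http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1&autoplay=1"></iframe>',
    ]
    for html in samples:
        mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % _VALID_EMBED_URL, html)
        print(mobj.group(1) if mobj else 'no match')  # both samples print the embed URL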
diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py
index 37c5c181f..639f9182c 100644
--- a/youtube_dl/extractor/drtuber.py
+++ b/youtube_dl/extractor/drtuber.py
@@ -36,25 +36,24 @@ class DrTuberIE(InfoExtractor):
r'<source src="([^"]+)"', webpage, 'video URL')
title = self._html_search_regex(
- [r'class="hd_title" style="[^"]+">([^<]+)</h1>', r'<title>([^<]+) - \d+'],
+ [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'],
webpage, 'title')
thumbnail = self._html_search_regex(
r'poster="([^"]+)"',
webpage, 'thumbnail', fatal=False)
- like_count = str_to_int(self._html_search_regex(
- r'<span id="rate_likes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
- webpage, 'like count', fatal=False))
- dislike_count = str_to_int(self._html_search_regex(
- r'<span id="rate_dislikes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
- webpage, 'like count', fatal=False))
- comment_count = str_to_int(self._html_search_regex(
- r'<span class="comments_count">([\d,\.]+)</span>',
- webpage, 'comment count', fatal=False))
+ def extract_count(id_, name):
+ return str_to_int(self._html_search_regex(
+ r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_,
+ webpage, '%s count' % name, fatal=False))
+
+ like_count = extract_count('rate_likes', 'like')
+ dislike_count = extract_count('rate_dislikes', 'dislike')
+ comment_count = extract_count('comments_count', 'comment')
cats_str = self._search_regex(
- r'<span>Categories:</span><div>(.+?)</div>', webpage, 'categories', fatal=False)
+ r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False)
categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str)
return {
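Small sanity check (not part of the patch) that the shared pattern inside extract_count() matches a count span whether the page marks it with id= or class=. The HTML fragments are made up; in the extractor the captured string is then run through str_to_int(), which strips the thousands separators.

    import re

    def count_from(webpage, id_):
        mobj = re.search(
            r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, webpage)
        return mobj.group(1) if mobj else None

    print(count_from('<span class="rate_likes">3,617</span>', 'rate_likes'))    # 3,617
    print(count_from('<span id="comments_count">12</span>', 'comments_count'))  # 12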
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 32e41d13e..ea60d4a96 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -669,6 +669,18 @@ class GenericIE(InfoExtractor):
'title': 'John Carlson Postgame 2/25/15',
},
},
+ # Kaltura embed (different embed code)
+ {
+ 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
+ 'info_dict': {
+ 'id': '1_a52wc67y',
+ 'ext': 'flv',
+ 'upload_date': '20150127',
+ 'uploader_id': 'PremierMedia',
+ 'timestamp': int,
+ 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
+ },
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -1492,8 +1504,8 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = re.search(
- r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
if mobj is not None:
return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
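For context (not part of the patch): whichever of the two patterns matches, the captured partner and entry ids are folded into an internal 'kaltura:<partner_id>:<entry_id>' URL that the Kaltura extractor then handles. The sketch below exercises the simpler, pre-existing kWidget form to show that hand-off; the page fragment and partner id are made up, the entry id mirrors the new test case.

    import re

    webpage = """<script>
    kWidget.embed({'wid': '_123456', 'entry_id': '1_a52wc67y', 'flashvars': {}});
    </script>"""

    mobj = re.search(
        r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',"
        r".*?'entry_id'\s*:\s*'(?P<id>[^']+)',",
        webpage)
    if mobj is not None:
        print('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict())  # kaltura:123456:1_a52wc67y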
diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py
index 63d87b74c..f5aa73d18 100644
--- a/youtube_dl/extractor/hentaistigma.py
+++ b/youtube_dl/extractor/hentaistigma.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -19,20 +17,19 @@ class HentaiStigmaIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
- r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>',
+ r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
webpage, 'title')
wrap_url = self._html_search_regex(
- r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url')
+ r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')
wrap_webpage = self._download_webpage(wrap_url, video_id)
video_url = self._html_search_regex(
- r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url')
+ r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
index 3f7d6666c..16677f179 100644
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -1,8 +1,7 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import parse_iso8601
class HowcastIE(InfoExtractor):
@@ -13,29 +12,31 @@ class HowcastIE(InfoExtractor):
'info_dict': {
'id': '390161',
'ext': 'mp4',
- 'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
'title': 'How to Tie a Square Knot Properly',
- }
+ 'description': 'md5:dbe792e5f6f1489027027bf2eba188a3',
+ 'timestamp': 1276081287,
+ 'upload_date': '20100609',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ video_id = self._match_id(url)
- video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- self.report_extraction(video_id)
-
- video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
- webpage, 'video URL')
-
- video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
- webpage, 'description', fatal=False)
+ embed_code = self._search_regex(
+ r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b',
+ webpage, 'ooyala embed code')
return {
+ '_type': 'url_transparent',
+ 'ie_key': 'Ooyala',
+ 'url': 'ooyala:%s' % embed_code,
'id': video_id,
- 'url': video_url,
- 'title': self._og_search_title(webpage),
- 'description': video_description,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'timestamp': parse_iso8601(self._html_search_meta(
+ 'article:published_time', webpage, 'timestamp')),
}
diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py
index 0847074ee..65712abc2 100644
--- a/youtube_dl/extractor/ina.py
+++ b/youtube_dl/extractor/ina.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
class InaIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
_TEST = {
'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 117a7faf6..91a1b3ccb 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -5,6 +5,7 @@ import base64
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urlparse,
)
@@ -45,7 +46,7 @@ class InfoQIE(InfoExtractor):
video_id, extension = video_filename.split('.')
http_base = self._search_regex(
- r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage,
+ r'EXPRESSINSTALL_SWF\s*=\s*[^"]*"((?:https?:)?//[^/"]+/)', webpage,
'HTTP base URL')
formats = [{
@@ -55,7 +56,7 @@ class InfoQIE(InfoExtractor):
'play_path': playpath,
}, {
'format_id': 'http',
- 'url': http_base + real_id,
+ 'url': compat_urlparse.urljoin(url, http_base) + real_id,
}]
self._sort_formats(formats)
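Why the urljoin: EXPRESSINSTALL_SWF can now be protocol-relative (start with '//'), so the HTTP download URL has to be resolved against the page URL to pick up a scheme. A quick stdlib check (compat_urlparse.urljoin wraps the same function); the hosts below are hypothetical:

    from urllib.parse import urljoin

    page_url = 'http://www.infoq.com/presentations/some-talk'
    print(urljoin(page_url, '//d1.example.net/'))        # http://d1.example.net/
    print(urljoin(page_url, 'http://cdn.example.net/'))  # absolute bases pass through unchanged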
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 5d8448571..62d12b7a6 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -16,8 +16,24 @@ class NPOBaseIE(InfoExtractor):
token_page = self._download_webpage(
'http://ida.omroep.nl/npoplayer/i.js',
video_id, note='Downloading token')
- return self._search_regex(
+ token = self._search_regex(
r'npoplayer\.token = "(.+?)"', token_page, 'token')
+ # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js
+ token_l = list(token)
+ first = second = None
+ for i in range(5, len(token_l) - 4):
+ if token_l[i].isdigit():
+ if first is None:
+ first = i
+ elif second is None:
+ second = i
+ if first is None or second is None:
+ first = 12
+ second = 13
+
+ token_l[first], token_l[second] = token_l[second], token_l[first]
+
+ return ''.join(token_l)
class NPOIE(NPOBaseIE):
@@ -92,7 +108,7 @@ class NPOIE(NPOBaseIE):
def _get_info(self, video_id):
metadata = self._download_json(
- 'http://e.omroep.nl/metadata/aflevering/%s' % video_id,
+ 'http://e.omroep.nl/metadata/%s' % video_id,
video_id,
# We have to remove the javascript callback
transform_source=strip_jsonp,
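Standalone version of the token descrambling added above, for readers who want to test it in isolation (assumption: the algorithm mirrors npoplayer-min.js as the comment says). It swaps the first two digit positions found inside token[5:-4] and falls back to positions 12 and 13 when fewer than two digits are present.

    def descramble_npo_token(token):
        token_l = list(token)
        first = second = None
        for i in range(5, len(token_l) - 4):
            if token_l[i].isdigit():
                if first is None:
                    first = i
                elif second is None:
                    second = i
        if first is None or second is None:
            first, second = 12, 13
        token_l[first], token_l[second] = token_l[second], token_l[first]
        return ''.join(token_l)

    # e.g. descramble_npo_token('abcde1fgh2ijklmnop') returns 'abcde2fgh1ijklmnop':
    # the characters at the first two digit positions swap places.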
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 143a76696..1e2b965f9 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -35,6 +36,9 @@ class PBSIE(InfoExtractor):
'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
'duration': 3190,
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
@@ -46,6 +50,9 @@ class PBSIE(InfoExtractor):
'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
'duration': 5050,
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ }
},
{
'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
@@ -68,7 +75,10 @@ class PBSIE(InfoExtractor):
'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
'duration': 6559,
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
@@ -82,7 +92,10 @@ class PBSIE(InfoExtractor):
'duration': 3172,
'thumbnail': 're:^https?://.*\.jpg$',
'upload_date': '20140122',
- }
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
@@ -90,6 +103,21 @@ class PBSIE(InfoExtractor):
'id': 'united-states-of-secrets',
},
'playlist_count': 2,
+ },
+ {
+ 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
+ 'info_dict': {
+ 'id': '2280706814',
+ 'display_id': 'player',
+ 'ext': 'mp4',
+ 'title': 'Death and the Civil War',
+ 'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.',
+ 'duration': 6705,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}
]
@@ -123,7 +151,7 @@ class PBSIE(InfoExtractor):
return media_id, presumptive_id, upload_date
url = self._search_regex(
- r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
+ r'<iframe\s+[^>]*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']',
webpage, 'player URL')
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/thisamericanlife.py b/youtube_dl/extractor/thisamericanlife.py
new file mode 100644
index 000000000..36493a5de
--- /dev/null
+++ b/youtube_dl/extractor/thisamericanlife.py
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ThisAmericanLifeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?thisamericanlife\.org/(?:radio-archives/episode/|play_full\.php\?play=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.thisamericanlife.org/radio-archives/episode/487/harper-high-school-part-one',
+ 'md5': '8f7d2da8926298fdfca2ee37764c11ce',
+ 'info_dict': {
+ 'id': '487',
+ 'ext': 'm4a',
+ 'title': '487: Harper High School, Part One',
+ 'description': 'md5:ee40bdf3fb96174a9027f76dbecea655',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.thisamericanlife.org/play_full.php?play=487',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.thisamericanlife.org/radio-archives/episode/%s' % video_id, video_id)
+
+ return {
+ 'id': video_id,
+ 'url': 'http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(video_id),
+ 'protocol': 'm3u8_native',
+ 'ext': 'm4a',
+ 'acodec': 'aac',
+ 'vcodec': 'none',
+ 'abr': 64,
+ 'title': self._html_search_meta(r'twitter:title', webpage, 'title', fatal=True),
+ 'description': self._html_search_meta(r'description', webpage, 'description'),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
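A side note on the new extractor (not part of the patch): the stream URL template repeats the positional placeholder, so a single format() argument fills both slots, and 'm3u8_native' asks youtube-dl's built-in HLS downloader to fetch the playlist instead of handing it to ffmpeg. Episode 487 is the one from the test above.

    episode_id = '487'
    print('http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(episode_id))
    # -> http://stream.thisamericanlife.org/487/stream/487_64k.m3u8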
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 8b43e274b..6769a009d 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -520,6 +520,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': 'requires avconv',
}
},
+ # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+ {
+ 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+ 'info_dict': {
+ 'id': 'FIl7x6_3R5Y',
+ 'ext': 'mp4',
+ 'title': 'md5:7b81415841e02ecd4313668cde88737a',
+ 'description': 'md5:116377fd2963b81ec4ce64b542173306',
+ 'upload_date': '20150625',
+ 'uploader_id': 'dorappi2000',
+ 'uploader': 'dorappi2000',
+ 'formats': 'mincount:33',
+ },
+ }
]
def __init__(self, *args, **kwargs):
@@ -826,6 +840,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
except StopIteration:
full_info = self._formats.get(format_id, {}).copy()
full_info.update(f)
+ codecs = r.attrib.get('codecs')
+ if codecs:
+ if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
+ full_info['vcodec'] = codecs
+ elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
+ full_info['acodec'] = codecs
formats.append(full_info)
else:
existing_format.update(f)
@@ -855,6 +875,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
player_url = None
+ dash_mpds = []
+
+ def add_dash_mpd(video_info):
+ dash_mpd = video_info.get('dashmpd')
+ if dash_mpd and dash_mpd[0] not in dash_mpds:
+ dash_mpds.append(dash_mpd[0])
+
# Get video info
embed_webpage = None
if re.search(r'player-age-gate-content">', video_webpage) is not None:
@@ -875,24 +902,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
note='Refetching age-gated info webpage',
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
+ add_dash_mpd(video_info)
else:
age_gate = False
- try:
- # Try looking directly into the video webpage
- mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
- if not mobj:
- raise ValueError('Could not find ytplayer.config') # caught below
+ video_info = None
+ # Try looking directly into the video webpage
+ mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
+ if mobj:
json_code = uppercase_escape(mobj.group(1))
ytplayer_config = json.loads(json_code)
args = ytplayer_config['args']
- # Convert to the same format returned by compat_parse_qs
- video_info = dict((k, [v]) for k, v in args.items())
- if not args.get('url_encoded_fmt_stream_map'):
- raise ValueError('No stream_map present') # caught below
- except ValueError:
- # We fallback to the get_video_info pages (used by the embed page)
+ if args.get('url_encoded_fmt_stream_map'):
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ add_dash_mpd(video_info)
+ if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+ # We also try looking in get_video_info since it may contain different dashmpd
+ # URL that points to a DASH manifest with possibly different itag set (some itags
+ # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+ # manifest pointed by get_video_info's dashmpd).
+ # The general idea is to take a union of itags of both DASH manifests (for example
+ # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
self.report_video_info_webpage_download(video_id)
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = (
'%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (proto, video_id, el_type))
@@ -900,8 +932,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_info_url,
video_id, note=False,
errnote='unable to download video info webpage')
- video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
+ get_video_info = compat_parse_qs(video_info_webpage)
+ add_dash_mpd(get_video_info)
+ if not video_info:
+ video_info = get_video_info
+ if 'token' in get_video_info:
break
if 'token' not in video_info:
if 'reason' in video_info:
@@ -964,15 +999,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date
- upload_date = None
- mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
- if mobj is None:
- mobj = re.search(
- r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
- video_webpage)
- if mobj is not None:
- upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
- upload_date = unified_strdate(upload_date)
+ upload_date = self._html_search_meta(
+ 'datePublished', video_webpage, 'upload date', default=None)
+ if not upload_date:
+ upload_date = self._search_regex(
+ [r'(?s)id="eow-date.*?>(.*?)</span>',
+ r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
+ video_webpage, 'upload date', default=None)
+ if upload_date:
+ upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
+ upload_date = unified_strdate(upload_date)
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
@@ -1125,24 +1161,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True):
- dash_mpd = video_info.get('dashmpd')
- if dash_mpd:
- dash_manifest_url = dash_mpd[0]
+ for dash_manifest_url in dash_mpds:
+ dash_formats = {}
try:
- dash_formats = self._parse_dash_manifest(
- video_id, dash_manifest_url, player_url, age_gate)
+ for df in self._parse_dash_manifest(
+ video_id, dash_manifest_url, player_url, age_gate):
+ # Do not overwrite DASH format found in some previous DASH manifest
+ if df['format_id'] not in dash_formats:
+ dash_formats[df['format_id']] = df
except (ExtractorError, KeyError) as e:
self.report_warning(
'Skipping DASH manifest: %r' % e, video_id)
- else:
+ if dash_formats:
# Remove the formats we found through non-DASH, they
# contain less info and it can be wrong, because we use
# fixed values (for example the resolution). See
# https://github.com/rg3/youtube-dl/issues/5774 for an
# example.
- dash_keys = set(df['format_id'] for df in dash_formats)
- formats = [f for f in formats if f['format_id'] not in dash_keys]
- formats.extend(dash_formats)
+ formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+ formats.extend(dash_formats.values())
# Check for malformed aspect ratio
stretched_m = re.search(
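The multi-manifest logic above, reduced to plain data for illustration (the format dicts are hypothetical, not real youtube-dl output): formats from each DASH manifest are collected keyed by format_id with the first manifest winning on collisions, and any non-DASH format whose format_id also appears in a DASH manifest is dropped in favour of the richer DASH entry.

    def merge_dash(non_dash_formats, dash_manifests):
        dash_formats = {}
        for manifest in dash_manifests:
            for df in manifest:
                # do not overwrite a format found in an earlier manifest
                dash_formats.setdefault(df['format_id'], df)
        if not dash_formats:
            return non_dash_formats
        merged = [f for f in non_dash_formats if f['format_id'] not in dash_formats]
        merged.extend(dash_formats.values())
        return merged

    print(merge_dash(
        [{'format_id': '137', 'height': 1080}],
        [[{'format_id': '137', 'height': 1080, 'tbr': 2500}],
         [{'format_id': '251', 'acodec': 'opus'}]]))
    # the plain itag 137 is replaced by the DASH one, itag 251 is added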
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index e7d067642..4762e1e3c 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -346,7 +346,7 @@ def parseOpts(overrideArguments=None):
video_format.add_option(
'--youtube-skip-dash-manifest',
action='store_false', dest='youtube_include_dash_manifest',
- help='Do not download the DASH manifest on YouTube videos')
+ help='Do not download the DASH manifests and related data on YouTube videos')
video_format.add_option(
'--merge-output-format',
action='store', dest='merge_output_format', metavar='FORMAT', default=None,
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index a225e03a1..eff4aebeb 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2015.06.25'
+__version__ = '2015.07.04'