aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md23
-rw-r--r--test/helper.py29
-rw-r--r--test/test_utils.py34
-rw-r--r--test/test_youtube_signature.py12
-rw-r--r--youtube_dl/extractor/__init__.py7
-rw-r--r--youtube_dl/extractor/arte.py6
-rw-r--r--youtube_dl/extractor/br.py5
-rw-r--r--youtube_dl/extractor/cliphunter.py4
-rw-r--r--youtube_dl/extractor/common.py6
-rw-r--r--youtube_dl/extractor/dropbox.py5
-rw-r--r--youtube_dl/extractor/eporner.py4
-rw-r--r--youtube_dl/extractor/facebook.py2
-rw-r--r--youtube_dl/extractor/funnyordie.py2
-rw-r--r--youtube_dl/extractor/generic.py34
-rw-r--r--youtube_dl/extractor/godtube.py12
-rw-r--r--youtube_dl/extractor/golem.py6
-rw-r--r--youtube_dl/extractor/ign.py7
-rw-r--r--youtube_dl/extractor/internetvideoarchive.py19
-rw-r--r--youtube_dl/extractor/izlesene.py3
-rw-r--r--youtube_dl/extractor/jukebox.py6
-rw-r--r--youtube_dl/extractor/lrt.py69
-rw-r--r--youtube_dl/extractor/nfl.py161
-rw-r--r--youtube_dl/extractor/pbs.py39
-rw-r--r--youtube_dl/extractor/prosiebensat1.py18
-rw-r--r--youtube_dl/extractor/spankwire.py48
-rw-r--r--youtube_dl/extractor/sportdeutschland.py4
-rw-r--r--youtube_dl/extractor/sunporno.py4
-rw-r--r--youtube_dl/extractor/tapely.py104
-rw-r--r--youtube_dl/extractor/ted.py2
-rw-r--r--youtube_dl/extractor/thvideo.py29
-rw-r--r--youtube_dl/extractor/tvigle.py17
-rw-r--r--youtube_dl/extractor/vgtv.py4
-rw-r--r--youtube_dl/extractor/vimeo.py99
-rw-r--r--youtube_dl/extractor/vuclip.py34
-rw-r--r--youtube_dl/extractor/worldstarhiphop.py22
-rw-r--r--youtube_dl/extractor/yahoo.py10
-rw-r--r--youtube_dl/extractor/ynet.py8
-rw-r--r--youtube_dl/extractor/youtube.py14
-rw-r--r--youtube_dl/options.py2
-rw-r--r--youtube_dl/utils.py83
-rw-r--r--youtube_dl/version.py2
41 files changed, 696 insertions, 303 deletions
diff --git a/README.md b/README.md
index 0f7442906..cabc5eb9a 100644
--- a/README.md
+++ b/README.md
@@ -348,21 +348,34 @@ $ youtube-dl --dateafter 20000101 --datebefore 20091231
# FAQ
-### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists
+### How do I update youtube-dl?
-YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos.
+If you've followed [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`).
+
+If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update.
-If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to report bugs to the Ubuntu packaging guys - all they have to do is update the package to a somewhat recent version.
+If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distributions serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum.
-Alternatively, uninstall the youtube-dl package and follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html). In a pinch, this should do if you used `apt-get` before to install youtube-dl:
+As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like
+
+ sudo apt-get remove -y youtube-dl
+
+Afterwards, simply follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html):
```
-sudo apt-get remove -y youtube-dl
sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl
sudo chmod a+x /usr/local/bin/youtube-dl
hash -r
```
+Again, from then on you'll be able to update with `sudo youtube-dl -U`.
+
+### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists
+
+YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos.
+
+If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to report bugs to the Ubuntu packaging guys - all they have to do is update the package to a somewhat recent version. See above for a way to update.
+
### Do I always have to pass in `--max-quality FORMAT`, or `-citw`?
By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, `--max-quality` *limits* the video quality (so if you want the best quality, do NOT pass it in), and the only option out of `-citw` that is regularly useful is `-i`.
diff --git a/test/helper.py b/test/helper.py
index 7f3ab8438..62cb3ce02 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import errno
import io
import hashlib
@@ -12,6 +14,7 @@ from youtube_dl import YoutubeDL
from youtube_dl.utils import (
compat_str,
preferredencoding,
+ write_string,
)
@@ -40,10 +43,10 @@ def report_warning(message):
If stderr is a tty file the 'WARNING:' will be colored
'''
if sys.stderr.isatty() and os.name != 'nt':
- _msg_header = u'\033[0;33mWARNING:\033[0m'
+ _msg_header = '\033[0;33mWARNING:\033[0m'
else:
- _msg_header = u'WARNING:'
- output = u'%s %s\n' % (_msg_header, message)
+ _msg_header = 'WARNING:'
+ output = '%s %s\n' % (_msg_header, message)
if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3:
output = output.encode(preferredencoding())
sys.stderr.write(output)
@@ -103,22 +106,22 @@ def expect_info_dict(self, expected_dict, got_dict):
self.assertTrue(
isinstance(got, compat_str),
- u'Expected a %s object, but got %s for field %s' % (
+ 'Expected a %s object, but got %s for field %s' % (
compat_str.__name__, type(got).__name__, info_field))
self.assertTrue(
match_rex.match(got),
- u'field %s (value: %r) should match %r' % (info_field, got, match_str))
+ 'field %s (value: %r) should match %r' % (info_field, got, match_str))
elif isinstance(expected, type):
got = got_dict.get(info_field)
self.assertTrue(isinstance(got, expected),
- u'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got)))
+ 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got)))
else:
if isinstance(expected, compat_str) and expected.startswith('md5:'):
got = 'md5:' + md5(got_dict.get(info_field))
else:
got = got_dict.get(info_field)
self.assertEqual(expected, got,
- u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
+ 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
# Check for the presence of mandatory fields
if got_dict.get('_type') != 'playlist':
@@ -126,7 +129,7 @@ def expect_info_dict(self, expected_dict, got_dict):
self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
# Check for mandatory fields that are automatically set by YoutubeDL
for key in ['webpage_url', 'extractor', 'extractor_key']:
- self.assertTrue(got_dict.get(key), u'Missing field: %s' % key)
+ self.assertTrue(got_dict.get(key), 'Missing field: %s' % key)
# Are checkable fields missing from the test case definition?
test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
@@ -134,7 +137,15 @@ def expect_info_dict(self, expected_dict, got_dict):
if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
if missing_keys:
- sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
+ def _repr(v):
+ if isinstance(v, compat_str):
+ return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'")
+ else:
+ return repr(v)
+ info_dict_str = ''.join(
+ ' %s: %s,\n' % (_repr(k), _repr(v))
+ for k, v in test_info_dict.items())
+ write_string('\n"info_dict": {' + info_dict_str + '}\n', out=sys.stderr)
self.assertFalse(
missing_keys,
'Missing keys in test definition: %s' % (
diff --git a/test/test_utils.py b/test/test_utils.py
index 3efbed29d..bcca0efea 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -22,7 +22,8 @@ from youtube_dl.utils import (
fix_xml_ampersands,
get_meta_content,
orderedSet,
- PagedList,
+ OnDemandPagedList,
+ InAdvancePagedList,
parse_duration,
read_batch_urls,
sanitize_filename,
@@ -43,6 +44,7 @@ from youtube_dl.utils import (
limit_length,
escape_rfc3986,
escape_url,
+ js_to_json,
)
@@ -137,6 +139,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
self.assertEqual(unified_strdate('1968-12-10'), '19681210')
+ self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
def test_find_xpath_attr(self):
testxml = '''<root>
@@ -246,10 +249,14 @@ class TestUtil(unittest.TestCase):
for i in range(firstid, upto):
yield i
- pl = PagedList(get_page, pagesize)
+ pl = OnDemandPagedList(get_page, pagesize)
got = pl.getslice(*sliceargs)
self.assertEqual(got, expected)
+ iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize)
+ got = iapl.getslice(*sliceargs)
+ self.assertEqual(got, expected)
+
testPL(5, 2, (), [0, 1, 2, 3, 4])
testPL(5, 2, (1,), [1, 2, 3, 4])
testPL(5, 2, (2,), [2, 3, 4])
@@ -325,5 +332,28 @@ class TestUtil(unittest.TestCase):
)
self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
+ def test_js_to_json_realworld(self):
+ inp = '''{
+ 'clip':{'provider':'pseudo'}
+ }'''
+ self.assertEqual(js_to_json(inp), '''{
+ "clip":{"provider":"pseudo"}
+ }''')
+ json.loads(js_to_json(inp))
+
+ inp = '''{
+ 'playlist':[{'controls':{'all':null}}]
+ }'''
+ self.assertEqual(js_to_json(inp), '''{
+ "playlist":[{"controls":{"all":null}}]
+ }''')
+
+ def test_js_to_json_edgecases(self):
+ on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
+ self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
+
+ on = js_to_json('{"abc": true}')
+ self.assertEqual(json.loads(on), {'abc': True})
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
index 604e76ab6..df2cb09f2 100644
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -48,18 +48,6 @@ _TESTS = [
'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2',
),
(
- 'http://s.ytimg.com/yts/swfbin/player-vfl5vIhK2/watch_as3.swf',
- 'swf',
- 86,
- 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVWXY\\!"#$%&\'()*+,-./:;<=>?'
- ),
- (
- 'http://s.ytimg.com/yts/swfbin/player-vflmDyk47/watch_as3.swf',
- 'swf',
- 'F375F75BF2AFDAAF2666E43868D46816F83F13E81C46.3725A8218E446A0DECD33F79DC282994D6AA92C92C9',
- '9C29AA6D499282CD97F33DCED0A644E8128A5273.64C18E31F38361864D86834E6662FAADFA2FB57F'
- ),
- (
'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js',
'js',
84,
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 86bff185b..e51ea701f 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -190,6 +190,7 @@ from .livestream import (
LivestreamOriginalIE,
LivestreamShortenerIE,
)
+from .lrt import LRTIE
from .lynda import (
LyndaIE,
LyndaCourseIE
@@ -354,6 +355,7 @@ from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
from .tagesschau import TagesschauIE
+from .tapely import TapelyIE
from .teachertube import (
TeacherTubeIE,
TeacherTubeUserIE,
@@ -371,7 +373,10 @@ from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
from .tnaflix import TNAFlixIE
-from .thvideo import THVideoIE
+from .thvideo import (
+ THVideoIE,
+ THVideoPlaylistIE
+)
from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 957d35979..c3d02f85e 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -86,11 +86,15 @@ class ArteTVPlus7IE(InfoExtractor):
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
+ upload_date_str = player_info.get('shootingDate')
+ if not upload_date_str:
+ upload_date_str = player_info.get('VDA', '').split(' ')[0]
+
info_dict = {
'id': player_info['VID'],
'title': player_info['VTI'],
'description': player_info.get('VDE'),
- 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
+ 'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index 4e2960c62..2e277c8c3 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -26,6 +26,8 @@ class BRIE(InfoExtractor):
'title': 'Wenn das Traditions-Theater wackelt',
'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt',
'duration': 34,
+ 'uploader': 'BR',
+ 'upload_date': '20140802',
}
},
{
@@ -66,8 +68,7 @@ class BRIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
+ display_id = self._match_id(url)
page = self._download_webpage(url, display_id)
xml_url = self._search_regex(
r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py
index 65c12136a..d4227e6eb 100644
--- a/youtube_dl/extractor/cliphunter.py
+++ b/youtube_dl/extractor/cliphunter.py
@@ -35,7 +35,6 @@ class CliphunterIE(InfoExtractor):
'title': 'Fun Jynx Maze solo',
'thumbnail': 're:^https?://.*\.jpg$',
'age_limit': 18,
- 'duration': 1317,
}
}
@@ -86,14 +85,11 @@ class CliphunterIE(InfoExtractor):
thumbnail = self._search_regex(
r"var\s+mov_thumb\s*=\s*'([^']+)';",
webpage, 'thumbnail', fatal=False)
- duration = int_or_none(self._search_regex(
- r'pl_dur\s*=\s*([0-9]+)', webpage, 'duration', fatal=False))
return {
'id': video_id,
'title': video_title,
'formats': formats,
- 'duration': duration,
'age_limit': self._rta_search(webpage),
'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index f43a0a569..611cf95f1 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -334,7 +334,11 @@ class InfoExtractor(object):
try:
return json.loads(json_string)
except ValueError as ve:
- raise ExtractorError('Failed to download JSON', cause=ve)
+ errmsg = '%s: Failed to parse JSON ' % video_id
+ if fatal:
+ raise ExtractorError(errmsg, cause=ve)
+ else:
+ self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
idstr = '' if video_id is None else '%s: ' % video_id
diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py
index 817a9bd61..5f24ac721 100644
--- a/youtube_dl/extractor/dropbox.py
+++ b/youtube_dl/extractor/dropbox.py
@@ -29,9 +29,8 @@ class DropboxIE(InfoExtractor):
video_id = mobj.group('id')
fn = compat_urllib_parse_unquote(url_basename(url))
title = os.path.splitext(fn)[0]
- video_url = (
- re.sub(r'[?&]dl=0', '', url) +
- ('?' if '?' in url else '&') + 'dl=1')
+ video_url = re.sub(r'[?&]dl=0', '', url)
+ video_url += ('?' if '?' not in video_url else '&') + 'dl=1'
return {
'id': video_id,
diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py
index 522aa3d63..bb231ecb1 100644
--- a/youtube_dl/extractor/eporner.py
+++ b/youtube_dl/extractor/eporner.py
@@ -14,11 +14,11 @@ class EpornerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)'
_TEST = {
'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
- 'md5': '3b427ae4b9d60619106de3185c2987cd',
+ 'md5': '39d486f046212d8e1b911c52ab4691f8',
'info_dict': {
'id': '95008',
'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Infamous Tiffany Teen Strip Tease Video',
'duration': 194,
'view_count': int,
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 60e68d98a..3ad993751 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -35,7 +35,7 @@ class FacebookIE(InfoExtractor):
'id': '637842556329505',
'ext': 'mp4',
'duration': 38,
- 'title': 'Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam fin...',
+ 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
}
}, {
'note': 'Video without discernible title',
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 721e5fce0..d966e8403 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor):
},
}, {
'url': 'http://www.funnyordie.com/embed/e402820827',
- 'md5': 'ff4d83318f89776ed0250634cfaa8d36',
+ 'md5': '29f4c5e5a61ca39dfd7e8348a75d0aad',
'info_dict': {
'id': 'e402820827',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 0dfa4853d..14c024e48 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -155,7 +155,6 @@ class GenericIE(InfoExtractor):
# funnyordie embed
{
'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
- 'md5': '7cf780be104d40fea7bae52eed4a470e',
'info_dict': {
'id': '18e820ec3f',
'ext': 'mp4',
@@ -180,13 +179,13 @@ class GenericIE(InfoExtractor):
# Embedded TED video
{
'url': 'http://en.support.wordpress.com/videos/ted-talks/',
- 'md5': 'deeeabcc1085eb2ba205474e7235a3d5',
+ 'md5': '65fdff94098e4a607385a60c5177c638',
'info_dict': {
- 'id': '981',
+ 'id': '1969',
'ext': 'mp4',
- 'title': 'My web playroom',
- 'uploader': 'Ze Frank',
- 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b',
+ 'title': 'Hidden miracles of the natural world',
+ 'uploader': 'Louie Schwartzberg',
+ 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
}
},
# Embeded Ustream video
@@ -226,21 +225,6 @@ class GenericIE(InfoExtractor):
'skip_download': 'Requires rtmpdump'
}
},
- # smotri embed
- {
- 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml',
- 'md5': 'ec40048448e9284c9a1de77bb188108b',
- 'info_dict': {
- 'id': 'v27008541fad',
- 'ext': 'mp4',
- 'title': 'Крым и Севастополь вошли в состав России',
- 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175',
- 'duration': 900,
- 'upload_date': '20140318',
- 'uploader': 'rbctv_2012_4',
- 'uploader_id': 'rbctv_2012_4',
- },
- },
# Condé Nast embed
{
'url': 'http://www.wired.com/2014/04/honda-asimo/',
@@ -295,13 +279,13 @@ class GenericIE(InfoExtractor):
{
'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
'info_dict': {
- 'id': 'jpSGZsgga_I',
+ 'id': '4vAffPZIT44',
'ext': 'mp4',
- 'title': 'Asphalt 8: Airborne - Launch Trailer',
+ 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
'uploader': 'Gameloft',
'uploader_id': 'gameloft',
- 'upload_date': '20130821',
- 'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a',
+ 'upload_date': '20140828',
+ 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
},
'params': {
'skip_download': True,
diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py
index 73bd6d890..363dc6608 100644
--- a/youtube_dl/extractor/godtube.py
+++ b/youtube_dl/extractor/godtube.py
@@ -36,16 +36,16 @@ class GodTubeIE(InfoExtractor):
'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(),
video_id, 'Downloading player config XML')
- video_url = config.find('.//file').text
- uploader = config.find('.//author').text
- timestamp = parse_iso8601(config.find('.//date').text)
- duration = parse_duration(config.find('.//duration').text)
- thumbnail = config.find('.//image').text
+ video_url = config.find('file').text
+ uploader = config.find('author').text
+ timestamp = parse_iso8601(config.find('date').text)
+ duration = parse_duration(config.find('duration').text)
+ thumbnail = config.find('image').text
media = self._download_xml(
'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML')
- title = media.find('.//title').text
+ title = media.find('title').text
return {
'id': video_id,
diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py
index bebfe8568..53714f47f 100644
--- a/youtube_dl/extractor/golem.py
+++ b/youtube_dl/extractor/golem.py
@@ -38,11 +38,9 @@ class GolemIE(InfoExtractor):
}
formats = []
- for e in config.findall('./*[url]'):
+ for e in config:
url = e.findtext('./url')
if not url:
- self._downloader.report_warning(
- "{0}: url: empty, skipping".format(e.tag))
continue
formats.append({
@@ -57,7 +55,7 @@ class GolemIE(InfoExtractor):
info['formats'] = formats
thumbnails = []
- for e in config.findall('.//teaser[url]'):
+ for e in config.findall('.//teaser'):
url = e.findtext('./url')
if not url:
continue
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index 12e9e61c4..c80185b53 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -89,7 +89,12 @@ class IGNIE(InfoExtractor):
'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]',
webpage)
if multiple_urls:
- return [self.url_result(u, ie='IGN') for u in multiple_urls]
+ entries = [self.url_result(u, ie='IGN') for u in multiple_urls]
+ return {
+ '_type': 'playlist',
+ 'id': name_or_id,
+ 'entries': entries,
+ }
video_id = self._find_video_id(webpage)
result = self._get_video_info(video_id)
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
index 4ddda2f1b..53f9a5f75 100644
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -12,12 +14,13 @@ class InternetVideoArchiveIE(InfoExtractor):
_VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
_TEST = {
- u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
- u'file': u'452693.mp4',
- u'info_dict': {
- u'title': u'SKYFALL',
- u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
- u'duration': 153,
+ 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
+ 'info_dict': {
+ 'id': '452693',
+ 'ext': 'mp4',
+ 'title': 'SKYFALL',
+ 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
+ 'duration': 149,
},
}
@@ -42,7 +45,7 @@ class InternetVideoArchiveIE(InfoExtractor):
url = self._build_url(query)
flashconfiguration = self._download_xml(url, video_id,
- u'Downloading flash configuration')
+ 'Downloading flash configuration')
file_url = flashconfiguration.find('file').text
file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
# Replace some of the parameters in the query to get the best quality
@@ -51,7 +54,7 @@ class InternetVideoArchiveIE(InfoExtractor):
lambda m: self._clean_query(m.group()),
file_url)
info = self._download_xml(file_url, video_id,
- u'Downloading video info')
+ 'Downloading video info')
item = info.find('channel/item')
def _bp(p):
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index a83dd249f..07ef682ee 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -63,7 +63,8 @@ class IzleseneIE(InfoExtractor):
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
+ thumbnail = self._proto_relative_url(
+ self._og_search_thumbnail(webpage), scheme='http:')
uploader = self._html_search_regex(
r"adduserUsername\s*=\s*'([^']+)';",
diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py
index 9b553b9fa..5aa32bf09 100644
--- a/youtube_dl/extractor/jukebox.py
+++ b/youtube_dl/extractor/jukebox.py
@@ -11,10 +11,9 @@ from ..utils import (
class JukeboxIE(InfoExtractor):
- _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
+ _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html'
_TEST = {
'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
- 'md5': '1574e9b4d6438446d5b7dbcdf2786276',
'info_dict': {
'id': 'r303r',
'ext': 'flv',
@@ -24,8 +23,7 @@ class JukeboxIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
+ video_id = self._match_id(url)
html = self._download_webpage(url, video_id)
iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
new file mode 100644
index 000000000..fca0bfef0
--- /dev/null
+++ b/youtube_dl/extractor/lrt.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ js_to_json,
+ parse_duration,
+ remove_end,
+)
+
+
+class LRTIE(InfoExtractor):
+ IE_NAME = 'lrt.lt'
+ _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
+ 'info_dict': {
+ 'id': '54391',
+ 'ext': 'mp4',
+ 'title': 'Septynios Kauno dienos',
+ 'description': 'Kauno miesto ir apskrities naujienos',
+ 'duration': 1783,
+ },
+ 'params': {
+ 'skip_download': True, # HLS download
+ },
+
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = remove_end(self._og_search_title(webpage), ' - LRT')
+ thumbnail = self._og_search_thumbnail(webpage)
+ description = self._og_search_description(webpage)
+ duration = parse_duration(self._search_regex(
+ r"'duration':\s*'([^']+)',", webpage,
+ 'duration', fatal=False, default=None))
+
+ formats = []
+ for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage):
+ data = json.loads(js_to_json(js))
+ if data['provider'] == 'rtmp':
+ formats.append({
+ 'format_id': 'rtmp',
+ 'ext': determine_ext(data['file']),
+ 'url': data['streamer'],
+ 'play_path': 'mp4:%s' % data['file'],
+ 'preference': -1,
+ })
+ else:
+ formats.extend(
+ self._extract_m3u8_formats(data['file'], video_id, 'mp4'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index 963c4587c..cc7c921c3 100644
--- a/youtube_dl/extractor/nfl.py
+++ b/youtube_dl/extractor/nfl.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ compat_urllib_parse_urlparse,
int_or_none,
remove_end,
)
@@ -13,76 +14,116 @@ from ..utils import (
class NFLIE(InfoExtractor):
IE_NAME = 'nfl.com'
- _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)'
- _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json'
- _TEST = {
- 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
- # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates
- 'info_dict': {
- 'id': '0ap3000000398478',
- 'ext': 'mp4',
- 'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights',
- 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
- 'upload_date': '20140921',
- 'timestamp': 1411337580,
- 'thumbnail': 're:^https?://.*\.jpg$',
+ _VALID_URL = r'''(?x)https?://
+ (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
+ (?:.+?/)*
+ (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+ _TESTS = [
+ {
+ 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+ 'md5': '394ef771ddcd1354f665b471d78ec4c6',
+ 'info_dict': {
+ 'id': '0ap3000000398478',
+ 'ext': 'mp4',
+ 'title': 'Week 3: Redskins vs. Eagles highlights',
+ 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+ 'upload_date': '20140921',
+ 'timestamp': 1411337580,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ },
+ {
+ 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+ 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+ 'info_dict': {
+ 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+ 'ext': 'mp4',
+ 'title': 'LIVE: Post Game vs. Browns',
+ 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+ 'upload_date': '20131229',
+ 'timestamp': 1388354455,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+ ]
+
+ @staticmethod
+ def prepend_host(host, url):
+ if not url.startswith('http'):
+ if not url.startswith('/'):
+ url = '/%s' % url
+ url = 'http://{0:}{1:}'.format(host, url)
+ return url
+
+ @staticmethod
+ def format_from_stream(stream, protocol, host, path_prefix='',
+ preference=0, note=None):
+ url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
+ protocol=protocol,
+ host=host,
+ prefix=path_prefix,
+ path=stream.get('path'),
+ )
+ return {
+ 'url': url,
+ 'vbr': int_or_none(stream.get('rate', 0), 1000),
+ 'preference': preference,
+ 'format_note': note,
}
- }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id, host = mobj.group('id'), mobj.group('host')
- config = self._download_json(self._PLAYER_CONFIG_URL, video_id,
- note='Downloading player config')
- url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config)
- video_data = self._download_json(url_template.format(id=video_id), video_id)
+ webpage = self._download_webpage(url, video_id)
- cdns = config.get('cdns')
- if not cdns:
- raise ExtractorError('Failed to get CDN data', expected=True)
+ config_url = NFLIE.prepend_host(host, self._search_regex(
+ r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL'))
+ config = self._download_json(config_url, video_id,
+ note='Downloading player config')
+ url_template = NFLIE.prepend_host(
+ host, '{contentURLTemplate:}'.format(**config))
+ video_data = self._download_json(
+ url_template.format(id=video_id), video_id)
formats = []
- streams = video_data.get('cdnData', {}).get('bitrateInfo', [])
- for name, cdn in cdns.items():
- # LimeLight streams don't seem to work
- if cdn.get('name') == 'LIMELIGHT':
- continue
-
- protocol = cdn.get('protocol')
- host = remove_end(cdn.get('host', ''), '/')
- if not (protocol and host):
- continue
-
- path_prefix = cdn.get('pathprefix', '')
- if path_prefix and not path_prefix.endswith('/'):
- path_prefix = '%s/' % path_prefix
-
- get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format(
- protocol=protocol,
- host=host,
- prefix=path_prefix,
- path=p,
- )
-
- if protocol == 'rtmp':
- preference = -2
- elif 'prog' in name.lower():
- preference = -1
- else:
- preference = 0
-
+ cdn_data = video_data.get('cdnData', {})
+ streams = cdn_data.get('bitrateInfo', [])
+ if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
+ parts = compat_urllib_parse_urlparse(cdn_data.get('uri'))
+ protocol, host = parts.scheme, parts.netloc
for stream in streams:
- path = stream.get('path')
- if not path:
+ formats.append(
+ NFLIE.format_from_stream(stream, protocol, host))
+ else:
+ cdns = config.get('cdns')
+ if not cdns:
+ raise ExtractorError('Failed to get CDN data', expected=True)
+
+ for name, cdn in cdns.items():
+ # LimeLight streams don't seem to work
+ if cdn.get('name') == 'LIMELIGHT':
continue
- formats.append({
- 'url': get_url(path),
- 'vbr': int_or_none(stream.get('rate', 0), 1000),
- 'preference': preference,
- 'format_note': name,
- })
+ protocol = cdn.get('protocol')
+ host = remove_end(cdn.get('host', ''), '/')
+ if not (protocol and host):
+ continue
+
+ prefix = cdn.get('pathprefix', '')
+ if prefix and not prefix.endswith('/'):
+ prefix = '%s/' % prefix
+
+ preference = 0
+ if protocol == 'rtmp':
+ preference = -2
+ elif 'prog' in name.lower():
+ preference = 1
+
+ for stream in streams:
+ formats.append(
+ NFLIE.format_from_stream(stream, protocol, host,
+ prefix, preference, name))
self._sort_formats(formats)
@@ -94,7 +135,7 @@ class NFLIE(InfoExtractor):
return {
'id': video_id,
- 'title': video_data.get('storyHeadline'),
+ 'title': video_data.get('headline'),
'formats': formats,
'description': video_data.get('caption'),
'duration': video_data.get('duration'),
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 2adfde909..8f140d626 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ unified_strdate,
US_RATINGS,
)
@@ -11,10 +12,10 @@ from ..utils import (
class PBSIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?:
- # Direct video URL
- video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
- # Article with embedded player
- (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
+ # Direct video URL
+ video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
+ # Article with embedded player (or direct video)
+ (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
@@ -65,10 +66,25 @@ class PBSIE(InfoExtractor):
'duration': 6559,
'thumbnail': 're:^https?://.*\.jpg$',
}
+ },
+ {
+ 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
+ 'md5': '908f3e5473a693b266b84e25e1cf9703',
+ 'info_dict': {
+ 'id': '2365160389',
+ 'display_id': 'killer-typhoon',
+ 'ext': 'mp4',
+ 'description': 'md5:c741d14e979fc53228c575894094f157',
+ 'title': 'Killer Typhoon',
+ 'duration': 3172,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20140122',
+ }
}
+
]
- def _extract_ids(self, url):
+ def _extract_webpage(self, url):
mobj = re.match(self._VALID_URL, url)
presumptive_id = mobj.group('presumptive_id')
@@ -76,15 +92,20 @@ class PBSIE(InfoExtractor):
if presumptive_id:
webpage = self._download_webpage(url, display_id)
+ upload_date = unified_strdate(self._search_regex(
+ r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
+ webpage, 'upload date', default=None))
+
MEDIA_ID_REGEXES = [
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
r'class="coveplayerid">([^<]+)<', # coveplayer
+ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
]
media_id = self._search_regex(
MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
if media_id:
- return media_id, presumptive_id
+ return media_id, presumptive_id, upload_date
url = self._search_regex(
r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
@@ -104,10 +125,10 @@ class PBSIE(InfoExtractor):
video_id = mobj.group('id')
display_id = video_id
- return video_id, display_id
+ return video_id, display_id, None
def _real_extract(self, url):
- video_id, display_id = self._extract_ids(url)
+ video_id, display_id, upload_date = self._extract_webpage(url)
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info = self._download_json(info_url, display_id)
@@ -119,6 +140,7 @@ class PBSIE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'title': info['title'],
'url': info['alternate_encoding']['url'],
'ext': 'mp4',
@@ -126,4 +148,5 @@ class PBSIE(InfoExtractor):
'thumbnail': info.get('image_url'),
'duration': info.get('duration'),
'age_limit': age_limit,
+ 'upload_date': upload_date,
}
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 5b2a723c1..619496de7 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -144,7 +144,7 @@ class ProSiebenSat1IE(InfoExtractor):
'id': '2156342',
'ext': 'mp4',
'title': 'Kurztrips zum Valentinstag',
- 'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
+ 'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.',
'duration': 307.24,
},
'params': {
@@ -180,12 +180,10 @@ class ProSiebenSat1IE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
- page = self._download_webpage(url, video_id, 'Downloading page')
-
- clip_id = self._html_search_regex(self._CLIPID_REGEXES, page, 'clip id')
+ clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
access_token = 'testclient'
client_name = 'kolibri-1.2.5'
@@ -234,12 +232,12 @@ class ProSiebenSat1IE(InfoExtractor):
urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
- title = self._html_search_regex(self._TITLE_REGEXES, page, 'title')
- description = self._html_search_regex(self._DESCRIPTION_REGEXES, page, 'description', fatal=False)
- thumbnail = self._og_search_thumbnail(page)
+ title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
+ description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
upload_date = unified_strdate(self._html_search_regex(
- self._UPLOAD_DATE_REGEXES, page, 'upload date', default=None))
+ self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
formats = []
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index 2007a0013..94602e89e 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -9,7 +9,6 @@ from ..utils import (
compat_urllib_parse,
unified_strdate,
str_to_int,
- int_or_none,
)
from ..aes import aes_decrypt_text
@@ -40,31 +39,42 @@ class SpankwireIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
+ title = self._html_search_regex(
+ r'<h1>([^<]+)', webpage, 'title')
description = self._html_search_regex(
- r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False)
+ r'<div\s+id="descriptionContent">([^<]+)<',
+ webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
- r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
+ r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
+ webpage, 'thumbnail', fatal=False)
uploader = self._html_search_regex(
- r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
+ r'by:\s*<a [^>]*>(.+?)</a>',
+ webpage, 'uploader', fatal=False)
uploader_id = self._html_search_regex(
- r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False)
- upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False)
- if upload_date:
- upload_date = unified_strdate(upload_date)
-
- view_count = self._html_search_regex(
- r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False)
- if view_count:
- view_count = str_to_int(view_count)
- comment_count = int_or_none(self._html_search_regex(
- r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False))
+ r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"',
+ webpage, 'uploader id', fatal=False)
+ upload_date = unified_strdate(self._html_search_regex(
+ r'</a> on (.+?) at \d+:\d+',
+ webpage, 'upload date', fatal=False))
- video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
+ view_count = str_to_int(self._html_search_regex(
+ r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
+ webpage, 'view count', fatal=False))
+ comment_count = str_to_int(self._html_search_regex(
+ r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>',
+ webpage, 'comment count', fatal=False))
+
+ video_urls = list(map(
+ compat_urllib_parse.unquote,
+ re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
if webpage.find('flashvars\.encrypted = "true"') != -1:
- password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ')
- video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
+ password = self._html_search_regex(
+ r'flashvars\.video_title = "([^"]+)',
+ webpage, 'password').replace('+', ' ')
+ video_urls = list(map(
+ lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'),
+ video_urls))
formats = []
for video_url in video_urls:
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
index 185353bef..abb827783 100644
--- a/youtube_dl/extractor/sportdeutschland.py
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -17,11 +17,11 @@ class SportDeutschlandIE(InfoExtractor):
'info_dict': {
'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen',
'ext': 'mp4',
- 'title': 'LIVE: Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen',
+ 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen',
'categories': ['Badminton'],
'view_count': int,
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 're:^Die Badminton-WM 2014 aus Kopenhagen LIVE',
+ 'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV',
'timestamp': int,
'upload_date': 're:^201408[23][0-9]$',
},
diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py
index 7de3c9dd5..263f09b46 100644
--- a/youtube_dl/extractor/sunporno.py
+++ b/youtube_dl/extractor/sunporno.py
@@ -39,10 +39,10 @@ class SunPornoIE(InfoExtractor):
r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
duration = parse_duration(self._search_regex(
- r'<span>Duration: (\d+:\d+)</span>', webpage, 'duration', fatal=False))
+ r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
- r'<span class="views">(\d+)</span>', webpage, 'view count', fatal=False))
+ r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False))
comment_count = int_or_none(self._html_search_regex(
r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False))
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py
new file mode 100644
index 000000000..77e056242
--- /dev/null
+++ b/youtube_dl/extractor/tapely.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ compat_urllib_request,
+ float_or_none,
+ parse_iso8601,
+)
+
+
+class TapelyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
+ _API_URL = 'http://tape.ly/showtape?id={0:}'
+ _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}'
+ _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}'
+ _TESTS = [
+ {
+ 'url': 'http://tape.ly/my-grief-as-told-by-water',
+ 'info_dict': {
+ 'id': 23952,
+ 'title': 'my grief as told by water',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'uploader_id': 16484,
+ 'timestamp': 1411848286,
+ 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.',
+ },
+ 'playlist_count': 13,
+ },
+ {
+ 'url': 'http://tape.ly/my-grief-as-told-by-water/1',
+ 'md5': '79031f459fdec6530663b854cbc5715c',
+ 'info_dict': {
+ 'id': 258464,
+ 'title': 'Dreaming Awake (My Brightest Diamond)',
+ 'ext': 'm4a',
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+
+ playlist_url = self._API_URL.format(display_id)
+ request = compat_urllib_request.Request(playlist_url)
+ request.add_header('X-Requested-With', 'XMLHttpRequest')
+ request.add_header('Accept', 'application/json')
+
+ playlist = self._download_json(request, display_id)
+
+ tape = playlist['tape']
+
+ entries = []
+ for s in tape['songs']:
+ song = s['song']
+ entry = {
+ 'id': song['id'],
+ 'duration': float_or_none(song.get('songduration'), 1000),
+ 'title': song['title'],
+ }
+ if song['source'] == 'S3':
+ entry.update({
+ 'url': self._S3_SONG_URL.format(song['filename']),
+ })
+ entries.append(entry)
+ elif song['source'] == 'YT':
+ self.to_screen('YouTube video detected')
+ yt_id = song['filename'].replace('/youtube/', '')
+ entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id))
+ entries.append(entry)
+ elif song['source'] == 'SC':
+ self.to_screen('SoundCloud song detected')
+ sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename'])
+ entry.update(self.url_result(sc_url, 'Soundcloud'))
+ entries.append(entry)
+ else:
+ self.report_warning('Unknown song source: %s' % song['source'])
+
+ if mobj.group('songnr'):
+ songnr = int(mobj.group('songnr')) - 1
+ try:
+ return entries[songnr]
+ except IndexError:
+ raise ExtractorError(
+ 'No song with index: %s' % mobj.group('songnr'),
+ expected=True)
+
+ return {
+ '_type': 'playlist',
+ 'id': tape['id'],
+ 'display_id': display_id,
+ 'title': tape['name'],
+ 'entries': entries,
+ 'thumbnail': tape.get('image_url'),
+ 'description': clean_html(tape.get('subtext')),
+ 'like_count': tape.get('likescount'),
+ 'uploader_id': tape.get('user_id'),
+ 'timestamp': parse_iso8601(tape.get('published_at')),
+ }
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 1cca47771..d5e28efad 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -149,7 +149,7 @@ class TEDIE(SubtitlesInfoExtractor):
thumbnail = 'http://' + thumbnail
return {
'id': video_id,
- 'title': talk_info['title'],
+ 'title': talk_info['title'].strip(),
'uploader': talk_info['speaker'],
'thumbnail': thumbnail,
'description': self._og_search_description(webpage),
diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py
index 607e947bb..496f15d80 100644
--- a/youtube_dl/extractor/thvideo.py
+++ b/youtube_dl/extractor/thvideo.py
@@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
# extract download link from mobile player page
webpage_player = self._download_webpage(
@@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor):
'description': description,
'upload_date': upload_date
}
+
+
+class THVideoPlaylistIE(InfoExtractor):
+ _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://thvideo.tv/mylist2',
+ 'info_dict': {
+ 'id': '2',
+ 'title': '幻想万華鏡',
+ },
+ 'playlist_mincount': 23,
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+ list_title = self._html_search_regex(
+ r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title',
+ fatal=False)
+
+ entries = [
+ self.url_result('http://thvideo.tv/v/th' + id, 'THVideo')
+ for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)]
+
+ return self.playlist_result(entries, playlist_id, list_title)
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index dc8697850..27962b5fe 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -17,16 +17,16 @@ class TvigleIE(InfoExtractor):
_TESTS = [
{
- 'url': 'http://www.tvigle.ru/video/brat-2/',
- 'md5': '72cb7eab33e54314e1790da402d3c9c3',
+ 'url': 'http://www.tvigle.ru/video/brat/',
+ 'md5': 'ff4344a4894b0524441fb6f8218dc716',
'info_dict': {
- 'id': '5119390',
- 'display_id': 'brat-2',
+ 'id': '5118490',
+ 'display_id': 'brat',
'ext': 'mp4',
- 'title': 'Брат 2 ',
- 'description': 'md5:5751f4fe345a58e1692585c361294bd8',
- 'duration': 7356.369,
- 'age_limit': 0,
+ 'title': 'Брат',
+ 'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb',
+ 'duration': 5722.6,
+ 'age_limit': 16,
},
},
{
@@ -71,6 +71,7 @@ class TvigleIE(InfoExtractor):
'format_id': '%s-%s' % (vcodec, quality),
'vcodec': vcodec,
'height': int(quality[:-1]),
+ 'filesize': item['video_files_size'][vcodec][quality],
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index 7d27d6c57..964470070 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -31,7 +31,7 @@ class VGTVIE(InfoExtractor):
'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen',
'info_dict': {
'id': '100764',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'OPPTAK: VGTV følger EM-kvalifiseringen',
'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3',
'thumbnail': 're:^https?://.*\.jpg',
@@ -50,7 +50,7 @@ class VGTVIE(InfoExtractor):
'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
'info_dict': {
'id': '100015',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
'thumbnail': 're:^https?://.*\.jpg',
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 4be1b8785..d2c36b58a 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -8,18 +8,19 @@ import itertools
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
+ clean_html,
compat_HTTPError,
compat_urllib_parse,
compat_urllib_request,
- clean_html,
- get_element_by_attribute,
+ compat_urlparse,
ExtractorError,
+ get_element_by_attribute,
+ InAdvancePagedList,
+ int_or_none,
RegexNotFoundError,
- smuggle_url,
std_headers,
unsmuggle_url,
urlencode_postdata,
- int_or_none,
)
@@ -90,6 +91,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader_id': 'openstreetmapus',
'uploader': 'OpenStreetMap US',
'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
+ 'description': 'md5:380943ec71b89736ff4bf27183233d09',
'duration': 1595,
},
},
@@ -104,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader': 'The BLN & Business of Software',
'uploader_id': 'theblnbusinessofsoftware',
'duration': 3610,
+ 'description': None,
},
},
{
@@ -118,6 +121,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
+ 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.',
},
'params': {
'videopassword': 'youtube-dl',
@@ -204,6 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ orig_url = url
if mobj.group('pro') or mobj.group('player'):
url = 'http://player.vimeo.com/video/' + video_id
@@ -274,18 +279,23 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
_, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]
# Extract video description
- video_description = None
- try:
- video_description = get_element_by_attribute("class", "description_wrapper", webpage)
- if video_description:
- video_description = clean_html(video_description)
- except AssertionError as err:
- # On some pages like (http://player.vimeo.com/video/54469442) the
- # html tags are not closed, python 2.6 cannot handle it
- if err.args[0] == 'we should not get here!':
- pass
- else:
- raise
+
+ video_description = self._html_search_regex(
+ r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
+ webpage, 'description', default=None)
+ if not video_description:
+ video_description = self._html_search_meta(
+ 'description', webpage, default=None)
+ if not video_description and mobj.group('pro'):
+ orig_webpage = self._download_webpage(
+ orig_url, video_id,
+ note='Downloading webpage for description',
+ fatal=False)
+ if orig_webpage:
+ video_description = self._html_search_meta(
+ 'description', orig_webpage, default=None)
+ if not video_description and not mobj.group('player'):
+ self._downloader.report_warning('Cannot find video description')
# Extract video duration
video_duration = int_or_none(config["video"].get("duration"))
@@ -533,32 +543,55 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
class VimeoLikesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
IE_NAME = 'vimeo:likes'
IE_DESC = 'Vimeo user likes'
_TEST = {
- 'url': 'https://vimeo.com/user20132939/likes',
- 'playlist_mincount': 4,
- 'add_ies': ['Generic'],
+ 'url': 'https://vimeo.com/user755559/likes/',
+ 'playlist_mincount': 293,
"info_dict": {
- "description": "Videos Philipp Hagemeister likes on Vimeo.",
- "title": "Vimeo / Philipp Hagemeister's likes",
- },
- 'params': {
- 'extract_flat': False,
+ "description": "See all the videos urza likes",
+ "title": 'Videos urza likes',
},
}
def _real_extract(self, url):
user_id = self._match_id(url)
- rss_url = '%s//vimeo.com/user%s/likes/rss' % (
- self.http_scheme(), user_id)
- surl = smuggle_url(rss_url, {
- 'force_videoid': '%s_likes' % user_id,
- 'to_generic': True,
- })
+ webpage = self._download_webpage(url, user_id)
+ page_count = self._int(
+ self._search_regex(
+ r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)">
+ .*?</a></li>\s*<li\s+class="pagination_next">
+ ''', webpage, 'page count'),
+ 'page count', fatal=True)
+ PAGE_SIZE = 12
+ title = self._html_search_regex(
+ r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False)
+ description = self._html_search_meta('description', webpage)
+
+ def _get_page(idx):
+ page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % (
+ self.http_scheme(), user_id, idx + 1)
+ webpage = self._download_webpage(
+ page_url, user_id,
+ note='Downloading page %d/%d' % (idx + 1, page_count))
+ video_list = self._search_regex(
+ r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>',
+ webpage, 'video content')
+ paths = re.findall(
+ r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list)
+ for path in paths:
+ yield {
+ '_type': 'url',
+ 'url': compat_urlparse.urljoin(page_url, path),
+ }
+
+ pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
return {
- '_type': 'url',
- 'url': surl,
+ '_type': 'playlist',
+ 'id': 'user%s_likes' % user_id,
+ 'title': title,
+ 'description': description,
+ 'entries': pl,
}
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
index fb0600f1a..ec3c010ad 100644
--- a/youtube_dl/extractor/vuclip.py
+++ b/youtube_dl/extractor/vuclip.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
+ ExtractorError,
parse_duration,
qualities,
)
@@ -14,13 +15,12 @@ class VuClipIE(InfoExtractor):
_VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434',
- 'md5': '92ac9d1ccefec4f0bb474661ab144fcf',
+ 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html',
'info_dict': {
- 'id': '843902317',
+ 'id': '922692425',
'ext': '3gp',
- 'title': 'Movie Trailer: Noah',
- 'duration': 139,
+ 'title': 'The Toy Soldiers - Hollywood Movie Trailer',
+ 'duration': 180,
}
}
@@ -37,16 +37,32 @@ class VuClipIE(InfoExtractor):
webpage = self._download_webpage(
adfree_url, video_id, note='Download post-ad page')
+ error_msg = self._html_search_regex(
+ r'<p class="message">(.*?)</p>', webpage, 'error message',
+ default=None)
+ if error_msg:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, error_msg), expected=True)
+
+ # These clowns alternate between two page types
links_code = self._search_regex(
- r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage,
- 'links')
+ r'''(?xs)
+ (?:
+ <img\s+src="/im/play.gif".*?>|
+ <!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
+ )
+ (.*?)
+ (?:
+ <a\s+href="fblike|<div\s+class="social">
+ )
+ ''', webpage, 'links')
title = self._html_search_regex(
r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip()
quality_order = qualities(['Reg', 'Hi'])
formats = []
for url, q in re.findall(
- r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code):
+ r'<a\s+href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code):
format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q
formats.append({
'format_id': format_id,
@@ -56,7 +72,7 @@ class VuClipIE(InfoExtractor):
self._sort_formats(formats)
duration = parse_duration(self._search_regex(
- r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False))
+ r'\(([0-9:]+)\)</span>', webpage, 'duration', fatal=False))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index 4e89acd81..bda3870db 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -13,37 +13,35 @@ class WorldStarHipHopIE(InfoExtractor):
"info_dict": {
"id": "wshh6a7q1ny0G34ZwuIO",
"ext": "mp4",
- "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
+ "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
}
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
- webpage_src = self._download_webpage(url, video_id)
-
- m_vevo_id = re.search(r'videoId=(.*?)&amp?',
- webpage_src)
+ m_vevo_id = re.search(r'videoId=(.*?)&amp?', webpage)
if m_vevo_id is not None:
return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
video_url = self._search_regex(
- r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL')
+ r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL')
if 'youtube' in video_url:
return self.url_result(video_url, ie='Youtube')
video_title = self._html_search_regex(
- r"<title>(.*)</title>", webpage_src, 'title')
+ r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+ webpage, 'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
thumbnail = self._html_search_regex(
- r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail',
+ r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',
fatal=False)
if not thumbnail:
- _title = r"""candytitles.*>(.*)</span>"""
- mobj = re.search(_title, webpage_src)
+ _title = r'candytitles.*>(.*)</span>'
+ mobj = re.search(_title, webpage)
if mobj is not None:
video_title = mobj.group(1)
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 3ab6017cd..221341c13 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -38,16 +38,6 @@ class YahooIE(InfoExtractor):
},
},
{
- 'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html',
- 'md5': '410b7104aa9893b765bc22787a22f3d9',
- 'info_dict': {
- 'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845',
- 'ext': 'mp4',
- 'title': 'The World Loves Spider-Man',
- 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
- }
- },
- {
'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
'md5': '60e8ac193d8fb71997caa8fce54c6460',
'info_dict': {
diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py
index 24872861a..944d7da38 100644
--- a/youtube_dl/extractor/ynet.py
+++ b/youtube_dl/extractor/ynet.py
@@ -13,7 +13,7 @@ class YnetIE(InfoExtractor):
_TESTS = [
{
'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html',
- 'md5': '002b44ee2f33d50363a1c153bed524cf',
+ 'md5': '4b29cb57c3dddd57642b3f051f535b07',
'info_dict': {
'id': 'L-11659-99244',
'ext': 'flv',
@@ -22,7 +22,7 @@ class YnetIE(InfoExtractor):
}
}, {
'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html',
- 'md5': '6455046ae1b48cf7e2b7cae285e53a16',
+ 'md5': '8194c2ea221e9a639cac96b6b0753dc5',
'info_dict': {
'id': 'L-8859-84418',
'ext': 'flv',
@@ -33,9 +33,7 @@ class YnetIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage))
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 99198e380..9041cfa87 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -26,7 +26,7 @@ from ..utils import (
get_element_by_attribute,
ExtractorError,
int_or_none,
- PagedList,
+ OnDemandPagedList,
unescapeHTML,
unified_strdate,
orderedSet,
@@ -655,6 +655,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Get video webpage
url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+ pref_cookies = [
+ c for c in self._downloader.cookiejar
+ if c.domain == '.youtube.com' and c.name == 'PREF']
+ for pc in pref_cookies:
+ if 'hl=' in pc.value:
+ pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
+ else:
+ if pc.value:
+ pc.value += '&'
+ pc.value += 'hl=en'
video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
@@ -1341,7 +1351,7 @@ class YoutubeUserIE(InfoExtractor):
'id': video_id,
'title': title,
}
- url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
+ url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
return self.playlist_result(url_results, playlist_title=username)
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 44dcb1e34..f651337ad 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None):
for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
try:
i = opts.index(private_opt)
- opts[i+1] = '<PRIVATE>'
+ opts[i+1] = 'PRIVATE'
except ValueError:
pass
return opts
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index b644f4e92..f8dd9c72d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -673,6 +673,8 @@ class ExtractorError(Exception):
expected = True
if video_id is not None:
msg = video_id + ': ' + msg
+ if cause:
+ msg += u' (caused by %r)' % cause
if not expected:
msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
super(ExtractorError, self).__init__(msg)
@@ -799,6 +801,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
del req.headers['User-agent']
req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
del req.headers['Youtubedl-user-agent']
+
+ if sys.version_info < (2, 7) and '#' in req.get_full_url():
+ # Python 2.6 is brain-dead when it comes to fragments
+ req._Request__original = req._Request__original.partition('#')[0]
+ req._Request__r_type = req._Request__r_type.partition('#')[0]
+
return req
def http_response(self, req, resp):
@@ -884,6 +892,7 @@ def unified_strdate(date_str):
'%d/%m/%Y',
'%d/%m/%y',
'%Y/%m/%d %H:%M:%S',
+ '%d/%m/%Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
'%d.%m.%Y %H:%M',
'%d.%m.%Y %H.%M',
@@ -1384,14 +1393,16 @@ def check_executable(exe, args=[]):
class PagedList(object):
- def __init__(self, pagefunc, pagesize):
- self._pagefunc = pagefunc
- self._pagesize = pagesize
-
def __len__(self):
# This is only useful for tests
return len(self.getslice())
+
+class OnDemandPagedList(PagedList):
+ def __init__(self, pagefunc, pagesize):
+ self._pagefunc = pagefunc
+ self._pagesize = pagesize
+
def getslice(self, start=0, end=None):
res = []
for pagenum in itertools.count(start // self._pagesize):
@@ -1430,6 +1441,35 @@ class PagedList(object):
return res
+class InAdvancePagedList(PagedList):
+ def __init__(self, pagefunc, pagecount, pagesize):
+ self._pagefunc = pagefunc
+ self._pagecount = pagecount
+ self._pagesize = pagesize
+
+ def getslice(self, start=0, end=None):
+ res = []
+ start_page = start // self._pagesize
+ end_page = (
+ self._pagecount if end is None else (end // self._pagesize + 1))
+ skip_elems = start - start_page * self._pagesize
+ only_more = None if end is None else end - start
+ for pagenum in range(start_page, end_page):
+ page = list(self._pagefunc(pagenum))
+ if skip_elems:
+ page = page[skip_elems:]
+ skip_elems = None
+ if only_more is not None:
+ if len(page) < only_more:
+ only_more -= len(page)
+ else:
+ page = page[:only_more]
+ res.extend(page)
+ break
+ res.extend(page)
+ return res
+
+
def uppercase_escape(s):
unicode_escape = codecs.getdecoder('unicode_escape')
return re.sub(
@@ -1540,27 +1580,24 @@ def strip_jsonp(code):
def js_to_json(code):
def fix_kv(m):
- key = m.group(2)
- if key.startswith("'"):
- assert key.endswith("'")
- assert '"' not in key
- key = '"%s"' % key[1:-1]
- elif not key.startswith('"'):
- key = '"%s"' % key
-
- value = m.group(4)
- if value.startswith("'"):
- assert value.endswith("'")
- assert '"' not in value
- value = '"%s"' % value[1:-1]
-
- return m.group(1) + key + m.group(3) + value
+ v = m.group(0)
+ if v in ('true', 'false', 'null'):
+ return v
+ if v.startswith('"'):
+ return v
+ if v.startswith("'"):
+ v = v[1:-1]
+ v = re.sub(r"\\\\|\\'|\"", lambda m: {
+ '\\\\': '\\\\',
+ "\\'": "'",
+ '"': '\\"',
+ }[m.group(0)], v)
+ return '"%s"' % v
res = re.sub(r'''(?x)
- ([{,]\s*)
- ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
- (:\s*)
- ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
+ "(?:[^"\\]*(?:\\\\|\\")?)*"|
+ '(?:[^'\\]*(?:\\\\|\\')?)*'|
+ [a-zA-Z_][a-zA-Z_0-9]*
''', fix_kv, code)
res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
return res
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index eb4356811..1384b496b 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2014.09.28.1'
+__version__ = '2014.09.29.2'