Diffstat (limited to 'youtube_dl/extractor')
 youtube_dl/extractor/__init__.py    |  13
 youtube_dl/extractor/allocine.py    |  89
 youtube_dl/extractor/anitube.py     |  32
 youtube_dl/extractor/arte.py        |  10
 youtube_dl/extractor/common.py      |   3
 youtube_dl/extractor/dailymotion.py |   2
 youtube_dl/extractor/drtv.py        |  91
 youtube_dl/extractor/generic.py     |  14
 youtube_dl/extractor/googleplus.py  |   3
 youtube_dl/extractor/ivi.py         |   2
 youtube_dl/extractor/livestream.py  |  52
 youtube_dl/extractor/motherless.py  |  87
 youtube_dl/extractor/mpora.py       |   2
 youtube_dl/extractor/newstube.py    |  15
 youtube_dl/extractor/niconico.py    | 105
 youtube_dl/extractor/ninegag.py     |   2
 youtube_dl/extractor/npo.py         |  62
 youtube_dl/extractor/rai.py         |   3
 youtube_dl/extractor/soundgasm.py   |  40
 youtube_dl/extractor/tagesschau.py  |  10
 youtube_dl/extractor/teachertube.py |  40
 youtube_dl/extractor/toypics.py     |   7
 youtube_dl/extractor/tumblr.py      |  23
 youtube_dl/extractor/veoh.py        |   1
 youtube_dl/extractor/videott.py     |  13
 youtube_dl/extractor/vk.py          |  52
 youtube_dl/extractor/wdr.py         |  15
 youtube_dl/extractor/youtube.py     | 100
 28 files changed, 708 insertions(+), 180 deletions(-)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 1666aa372..7b3f9ae24 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -3,6 +3,7 @@ from .addanime import AddAnimeIE
from .aftonbladet import AftonbladetIE
from .anitube import AnitubeIE
from .aol import AolIE
+from .allocine import AllocineIE
from .aparat import AparatIE
from .appletrailers import AppleTrailersIE
from .archiveorg import ArchiveOrgIE
@@ -63,6 +64,7 @@ from .dailymotion import (
from .daum import DaumIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
+from .drtv import DRTVIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
from .divxstage import DivxStageIE
@@ -147,7 +149,11 @@ from .ku6 import Ku6IE
from .la7 import LA7IE
from .lifenews import LifeNewsIE
from .liveleak import LiveLeakIE
-from .livestream import LivestreamIE, LivestreamOriginalIE
+from .livestream import (
+ LivestreamIE,
+ LivestreamOriginalIE,
+ LivestreamShortenerIE,
+)
from .lynda import (
LyndaIE,
LyndaCourseIE
@@ -165,6 +171,7 @@ from .mpora import MporaIE
from .mofosex import MofosexIE
from .mooshare import MooshareIE
from .morningstar import MorningstarIE
+from .motherless import MotherlessIE
from .motorsport import MotorsportIE
from .moviezine import MoviezineIE
from .movshare import MovShareIE
@@ -197,6 +204,7 @@ from .normalboots import NormalbootsIE
from .novamov import NovaMovIE
from .nowness import NownessIE
from .nowvideo import NowVideoIE
+from .npo import NPOIE
from .nrk import (
NRKIE,
NRKTVIE,
@@ -255,6 +263,7 @@ from .soundcloud import (
SoundcloudUserIE,
SoundcloudPlaylistIE
)
+from .soundgasm import SoundgasmIE
from .southparkstudios import (
SouthParkStudiosIE,
SouthparkDeIE,
@@ -274,7 +283,7 @@ from .sztvhu import SztvHuIE
from .tagesschau import TagesschauIE
from .teachertube import (
TeacherTubeIE,
- TeacherTubeClassroomIE,
+ TeacherTubeUserIE,
)
from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py
new file mode 100644
index 000000000..34f0cd49b
--- /dev/null
+++ b/youtube_dl/extractor/allocine.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_str,
+ qualities,
+ determine_ext,
+)
+
+
+class AllocineIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?P<typ>article|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=)(?P<id>[0-9]+)(?:\.html)?'
+
+ _TESTS = [{
+ 'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html',
+ 'md5': '0c9fcf59a841f65635fa300ac43d8269',
+ 'info_dict': {
+ 'id': '19546517',
+ 'ext': 'mp4',
+ 'title': 'Astérix - Le Domaine des Dieux Teaser VF',
+ 'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }, {
+ 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html',
+ 'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0',
+ 'info_dict': {
+ 'id': '19540403',
+ 'ext': 'mp4',
+ 'title': 'Planes 2 Bande-annonce VF',
+ 'description': 'md5:c4b1f7bd682a91de6491ada267ec0f4d',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }, {
+ 'url': 'http://www.allocine.fr/film/fichefilm_gen_cfilm=181290.html',
+ 'md5': '101250fb127ef9ca3d73186ff22a47ce',
+ 'info_dict': {
+ 'id': '19544709',
+ 'ext': 'mp4',
+ 'title': 'Dragons 2 - Bande annonce finale VF',
+ 'description': 'md5:e74a4dc750894bac300ece46c7036490',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ typ = mobj.group('typ')
+ display_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ if typ == 'film':
+ video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id')
+ else:
+ player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player')
+
+ player_data = json.loads(player)
+ video_id = compat_str(player_data['refMedia'])
+
+ xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id)
+
+ video = xml.find('.//AcVisionVideo').attrib
+ quality = qualities(['ld', 'md', 'hd'])
+
+ formats = []
+ for k, v in video.items():
+ if re.match(r'.+_path', k):
+ format_id = k.split('_')[0]
+ formats.append({
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ 'url': v,
+ 'ext': determine_ext(v),
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video['videoTitle'],
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'formats': formats,
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py
index 2b019daa9..31f0d417c 100644
--- a/youtube_dl/extractor/anitube.py
+++ b/youtube_dl/extractor/anitube.py
@@ -1,22 +1,24 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
class AnitubeIE(InfoExtractor):
- IE_NAME = u'anitube.se'
+ IE_NAME = 'anitube.se'
_VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)'
_TEST = {
- u'url': u'http://www.anitube.se/video/36621',
- u'md5': u'59d0eeae28ea0bc8c05e7af429998d43',
- u'file': u'36621.mp4',
- u'info_dict': {
- u'id': u'36621',
- u'ext': u'mp4',
- u'title': u'Recorder to Randoseru 01',
+ 'url': 'http://www.anitube.se/video/36621',
+ 'md5': '59d0eeae28ea0bc8c05e7af429998d43',
+ 'info_dict': {
+ 'id': '36621',
+ 'ext': 'mp4',
+ 'title': 'Recorder to Randoseru 01',
+ 'duration': 180.19,
},
- u'skip': u'Blocked in the US',
+ 'skip': 'Blocked in the US',
}
def _real_extract(self, url):
@@ -24,13 +26,15 @@ class AnitubeIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)',
- webpage, u'key')
+ key = self._html_search_regex(
+ r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key')
- config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key,
- key)
+ config_xml = self._download_xml(
+ 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key)
video_title = config_xml.find('title').text
+ thumbnail = config_xml.find('image').text
+ duration = float(config_xml.find('duration').text)
formats = []
video_url = config_xml.find('file')
@@ -49,5 +53,7 @@ class AnitubeIE(InfoExtractor):
return {
'id': video_id,
'title': video_title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
'formats': formats
}
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index b528a9ec5..9591bad8a 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -39,7 +39,10 @@ class ArteTvIE(InfoExtractor):
formats = [{
'forma_id': q.attrib['quality'],
- 'url': q.text,
+ # The playpath starts at 'mp4:', if we don't manually
+ # split the url, rtmpdump will incorrectly parse them
+ 'url': q.text.split('mp4:', 1)[0],
+ 'play_path': 'mp4:' + q.text.split('mp4:', 1)[1],
'ext': 'flv',
'quality': 2 if q.attrib['quality'] == 'hd' else 1,
} for q in config.findall('./urls/url')]
@@ -111,7 +114,7 @@ class ArteTVPlus7IE(InfoExtractor):
if not formats:
# Some videos are only available in the 'Originalversion'
# they aren't tagged as being in French or German
- if all(f['versionCode'] == 'VO' for f in all_formats):
+ if all(f['versionCode'] == 'VO' or f['versionCode'] == 'VA' for f in all_formats):
formats = all_formats
else:
raise ExtractorError(u'The formats list is empty')
@@ -189,9 +192,10 @@ class ArteTVFutureIE(ArteTVPlus7IE):
_TEST = {
'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
'info_dict': {
- 'id': '050940-003',
+ 'id': '5201',
'ext': 'mp4',
'title': 'Les champignons au secours de la planète',
+ 'upload_date': '20131101',
},
}
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 49e75405e..e4e4feef9 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -459,6 +459,9 @@ class InfoExtractor(object):
if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
+ def _og_search_url(self, html, **kargs):
+ return self._og_search_property('url', html, **kargs)
+
def _html_search_meta(self, name, html, display_name=None, fatal=False):
if display_name is None:
display_name = name
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 55216201f..5d0bfe454 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -150,7 +150,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
return {
'id': video_id,
'formats': formats,
- 'uploader': info['owner_screenname'],
+ 'uploader': info['owner.screenname'],
'upload_date': video_upload_date,
'title': self._og_search_title(webpage),
'subtitles': video_subtitles,
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
new file mode 100644
index 000000000..cdccfd376
--- /dev/null
+++ b/youtube_dl/extractor/drtv.py
@@ -0,0 +1,91 @@
+from __future__ import unicode_literals
+
+import re
+
+from .subtitles import SubtitlesInfoExtractor
+from .common import ExtractorError
+from ..utils import parse_iso8601
+
+
+class DRTVIE(SubtitlesInfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?dr\.dk/tv/se/[^/]+/(?P<id>[\da-z-]+)'
+
+ _TEST = {
+ 'url': 'http://www.dr.dk/tv/se/partiets-mand/partiets-mand-7-8',
+ 'md5': '4a7e1dd65cdb2643500a3f753c942f25',
+ 'info_dict': {
+ 'id': 'partiets-mand-7-8',
+ 'ext': 'mp4',
+ 'title': 'Partiets mand (7:8)',
+ 'description': 'md5:a684b90a8f9336cd4aab94b7647d7862',
+ 'timestamp': 1403047940,
+ 'upload_date': '20140617',
+ 'duration': 1299.040,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ programcard = self._download_json(
+ 'http://www.dr.dk/mu/programcard/expanded/%s' % video_id, video_id, 'Downloading video JSON')
+
+ data = programcard['Data'][0]
+
+ title = data['Title']
+ description = data['Description']
+ timestamp = parse_iso8601(data['CreatedTime'][:-5])
+
+ thumbnail = None
+ duration = None
+
+ restricted_to_denmark = False
+
+ formats = []
+ subtitles = {}
+
+ for asset in data['Assets']:
+ if asset['Kind'] == 'Image':
+ thumbnail = asset['Uri']
+ elif asset['Kind'] == 'VideoResource':
+ duration = asset['DurationInMilliseconds'] / 1000.0
+ restricted_to_denmark = asset['RestrictedToDenmark']
+ for link in asset['Links']:
+ target = link['Target']
+ uri = link['Uri']
+ formats.append({
+ 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
+ 'format_id': target,
+ 'ext': link['FileFormat'],
+ 'preference': -1 if target == 'HDS' else -2,
+ })
+ subtitles_list = asset.get('SubtitlesList')
+ if isinstance(subtitles_list, list):
+ LANGS = {
+ 'Danish': 'dk',
+ }
+ for subs in subtitles_list:
+ lang = subs['Language']
+ subtitles[LANGS.get(lang, lang)] = subs['Uri']
+
+ if not formats and restricted_to_denmark:
+ raise ExtractorError(
+ 'Unfortunately, DR is not allowed to show this program outside Denmark.', expected=True)
+
+ self._sort_formats(formats)
+
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, subtitles)
+ return
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': self.extract_subtitles(video_id, subtitles),
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 9dd03aba4..f97b59845 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -383,7 +383,7 @@ class GenericIE(InfoExtractor):
if not parsed_url.scheme:
default_search = self._downloader.params.get('default_search')
if default_search is None:
- default_search = 'auto_warning'
+ default_search = 'error'
if default_search in ('auto', 'auto_warning'):
if '/' in url:
@@ -397,8 +397,13 @@ class GenericIE(InfoExtractor):
expected=True)
else:
self._downloader.report_warning(
- 'Falling back to youtube search for %s . Set --default-search to "auto" to suppress this warning.' % url)
+ 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
return self.url_result('ytsearch:' + url)
+ elif default_search == 'error':
+ raise ExtractorError(
+ ('%r is not a valid URL. '
+ 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
+ ) % (url, url), expected=True)
else:
assert ':' in default_search
return self.url_result(default_search + url)
@@ -620,6 +625,11 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'VK')
+ # Look for embedded ivi player
+ mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Ivi')
+
# Look for embedded Huffington Post player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
index cc29a7e5d..07d994b44 100644
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -52,8 +52,7 @@ class GooglePlusIE(InfoExtractor):
# Extract title
# Get the first line for title
- video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
- webpage, 'title', default='NA')
+ video_title = self._og_search_description(webpage).splitlines()[0]
# Step 2, Simulate clicking the image box to launch video
DOMAIN = 'https://plus.google.com/'
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index 528be1524..4027deb70 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -14,7 +14,7 @@ from ..utils import (
class IviIE(InfoExtractor):
IE_DESC = 'ivi.ru'
IE_NAME = 'ivi'
- _VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<videoid>\d+)'
_TESTS = [
# Single movie
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 5c71f4f09..2c100d424 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -9,6 +9,7 @@ from ..utils import (
compat_urlparse,
xpath_with_ns,
compat_str,
+ orderedSet,
)
@@ -64,7 +65,10 @@ class LivestreamIE(InfoExtractor):
# The original version of Livestream uses a different system
class LivestreamOriginalIE(InfoExtractor):
IE_NAME = 'livestream:original'
- _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)'
+ _VALID_URL = r'''(?x)https?://www\.livestream\.com/
+ (?P<user>[^/]+)/(?P<type>video|folder)
+ (?:\?.*?Id=|/)(?P<id>.*?)(&|$)
+ '''
_TEST = {
'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
'info_dict': {
@@ -78,10 +82,7 @@ class LivestreamOriginalIE(InfoExtractor):
},
}
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- user = mobj.group('user')
+ def _extract_video(self, user, video_id):
api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
info = self._download_xml(api_url, video_id)
@@ -99,3 +100,44 @@ class LivestreamOriginalIE(InfoExtractor):
'ext': 'flv',
'thumbnail': thumbnail_url,
}
+
+ def _extract_folder(self, url, folder_id):
+ webpage = self._download_webpage(url, folder_id)
+ urls = orderedSet(re.findall(r'<a href="(https?://livestre\.am/.*?)"', webpage))
+
+ return {
+ '_type': 'playlist',
+ 'id': folder_id,
+ 'entries': [{
+ '_type': 'url',
+ 'url': video_url,
+ } for video_url in urls],
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ id = mobj.group('id')
+ user = mobj.group('user')
+ url_type = mobj.group('type')
+ if url_type == 'folder':
+ return self._extract_folder(url, id)
+ else:
+ return self._extract_video(user, id)
+
+
+# The server doesn't support HEAD request, the generic extractor can't detect
+# the redirection
+class LivestreamShortenerIE(InfoExtractor):
+ IE_NAME = 'livestream:shortener'
+ IE_DESC = False # Do not list
+ _VALID_URL = r'https?://livestre\.am/(?P<id>.+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ id = mobj.group('id')
+ webpage = self._download_webpage(url, id)
+
+ return {
+ '_type': 'url',
+ 'url': self._og_search_url(webpage),
+ }
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
new file mode 100644
index 000000000..6229b2173
--- /dev/null
+++ b/youtube_dl/extractor/motherless.py
@@ -0,0 +1,87 @@
+from __future__ import unicode_literals
+
+import datetime
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
+
+
+class MotherlessIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)'
+ _TESTS = [
+ {
+ 'url': 'http://motherless.com/AC3FFE1',
+ 'md5': '5527fef81d2e529215dad3c2d744a7d9',
+ 'info_dict': {
+ 'id': 'AC3FFE1',
+ 'ext': 'flv',
+ 'title': 'Fucked in the ass while playing PS3',
+ 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
+ 'upload_date': '20100913',
+ 'uploader_id': 'famouslyfuckedup',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
+ }
+ },
+ {
+ 'url': 'http://motherless.com/532291B',
+ 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
+ 'info_dict': {
+ 'id': '532291B',
+ 'ext': 'mp4',
+ 'title': 'Amazing girl playing the omegle game, PERFECT!',
+ 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'],
+ 'upload_date': '20140622',
+ 'uploader_id': 'Sulivana7x',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
+ }
+ }
+ ]
+
+ def _real_extract(self,url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
+
+ video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url')
+ age_limit = self._rta_search(webpage)
+
+ view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count')
+
+ upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date')
+ if 'Ago' in upload_date:
+ days = int(re.search(r'([0-9]+)', upload_date).group(1))
+ upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')
+ else:
+ upload_date = unified_strdate(upload_date)
+
+ like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count')
+
+ comment_count = webpage.count('class="media-comment-contents"')
+ uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id')
+
+ categories = self._html_search_meta('keywords', webpage)
+ if categories:
+ categories = [cat.strip() for cat in categories.split(',')]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'upload_date': upload_date,
+ 'uploader_id': uploader_id,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'categories': categories,
+ 'view_count': int_or_none(view_count.replace(',', '')),
+ 'like_count': int_or_none(like_count.replace(',', '')),
+ 'comment_count': comment_count,
+ 'age_limit': age_limit,
+ 'url': video_url,
+ }
diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py
index 39d6feb98..387935d4d 100644
--- a/youtube_dl/extractor/mpora.py
+++ b/youtube_dl/extractor/mpora.py
@@ -28,7 +28,7 @@ class MporaIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
data_json = self._search_regex(
- r"new FM\.Player\('[^']+',\s*(\{.*?)\);\n", webpage, 'json')
+ r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", webpage, 'json')
data = json.loads(data_json)
diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py
index 2fd5b8f04..551bd4d7a 100644
--- a/youtube_dl/extractor/newstube.py
+++ b/youtube_dl/extractor/newstube.py
@@ -4,18 +4,19 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import ExtractorError
class NewstubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)'
_TEST = {
- 'url': 'http://newstube.ru/media/na-korable-progress-prodolzhaetsya-testirovanie-sistemy-kurs',
+ 'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym',
'info_dict': {
- 'id': 'd156a237-a6e9-4111-a682-039995f721f1',
+ 'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6',
'ext': 'flv',
- 'title': 'На корабле «Прогресс» продолжается тестирование системы «Курс»',
- 'description': 'md5:d0cbe7b4a6f600552617e48548d5dc77',
- 'duration': 20.04,
+ 'title': 'Телеканал CNN переместил город Славянск в Крым',
+ 'description': 'md5:419a8c9f03442bc0b0a794d689360335',
+ 'duration': 31.05,
},
'params': {
# rtmp download
@@ -40,6 +41,10 @@ class NewstubeIE(InfoExtractor):
def ns(s):
return s.replace('/', '/%(ns)s') % {'ns': '{http://app1.newstube.ru/N2SiteWS/player.asmx}'}
+ error_message = player.find(ns('./ErrorMessage'))
+ if error_message is not None:
+ raise ExtractorError('%s returned error: %s' % (self.IE_NAME, error_message.text), expected=True)
+
session_id = player.find(ns('./SessionId')).text
media_info = player.find(ns('./Medias/MediaInfo'))
title = media_info.find(ns('./Name')).text
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 517a72561..c0c139b5d 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -8,10 +8,9 @@ from ..utils import (
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
- compat_str,
-
- ExtractorError,
unified_strdate,
+ parse_duration,
+ int_or_none,
)
@@ -30,6 +29,7 @@ class NiconicoIE(InfoExtractor):
'uploader_id': '2698420',
'upload_date': '20131123',
'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+ 'duration': 33,
},
'params': {
'username': 'ydl.niconico@gmail.com',
@@ -37,17 +37,20 @@ class NiconicoIE(InfoExtractor):
},
}
- _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
+ _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico'
+ # Determine whether the downloader uses authentication to download video
+ _AUTHENTICATE = False
def _real_initialize(self):
- self._login()
+ if self._downloader.params.get('username', None) is not None:
+ self._AUTHENTICATE = True
+
+ if self._AUTHENTICATE:
+ self._login()
def _login(self):
(username, password) = self._get_login_info()
- if username is None:
- # Login is required
- raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
# Log in
login_form_strs = {
@@ -79,44 +82,66 @@ class NiconicoIE(InfoExtractor):
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
note='Downloading video info page')
- # Get flv info
- flv_info_webpage = self._download_webpage(
- 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
- video_id, 'Downloading flv info')
+ if self._AUTHENTICATE:
+ # Get flv info
+ flv_info_webpage = self._download_webpage(
+ 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+ video_id, 'Downloading flv info')
+ else:
+ # Get external player info
+ ext_player_info = self._download_webpage(
+ 'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id)
+ thumb_play_key = self._search_regex(
+ r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey')
+
+ # Get flv info
+ flv_info_data = compat_urllib_parse.urlencode({
+ 'k': thumb_play_key,
+ 'v': video_id
+ })
+ flv_info_request = compat_urllib_request.Request(
+ 'http://ext.nicovideo.jp/thumb_watch', flv_info_data,
+ {'Content-Type': 'application/x-www-form-urlencoded'})
+ flv_info_webpage = self._download_webpage(
+ flv_info_request, video_id,
+ note='Downloading flv info', errnote='Unable to download flv info')
+
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information
- video_title = video_info.find('.//title').text
- video_extension = video_info.find('.//movie_type').text
- video_format = video_extension.upper()
- video_thumbnail = video_info.find('.//thumbnail_url').text
- video_description = video_info.find('.//description').text
- video_uploader_id = video_info.find('.//user_id').text
- video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
- video_view_count = video_info.find('.//view_counter').text
- video_webpage_url = video_info.find('.//watch_url').text
-
- # uploader
- video_uploader = video_uploader_id
- url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
- try:
- user_info = self._download_xml(
- url, video_id, note='Downloading user information')
- video_uploader = user_info.find('.//nickname').text
- except ExtractorError as err:
- self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err))
+ title = video_info.find('.//title').text
+ extension = video_info.find('.//movie_type').text
+ video_format = extension.upper()
+ thumbnail = video_info.find('.//thumbnail_url').text
+ description = video_info.find('.//description').text
+ upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
+ view_count = int_or_none(video_info.find('.//view_counter').text)
+ comment_count = int_or_none(video_info.find('.//comment_num').text)
+ duration = parse_duration(video_info.find('.//length').text)
+ webpage_url = video_info.find('.//watch_url').text
+
+ if video_info.find('.//ch_id') is not None:
+ uploader_id = video_info.find('.//ch_id').text
+ uploader = video_info.find('.//ch_name').text
+ elif video_info.find('.//user_id') is not None:
+ uploader_id = video_info.find('.//user_id').text
+ uploader = video_info.find('.//user_nickname').text
+ else:
+ uploader_id = uploader = None
return {
'id': video_id,
'url': video_real_url,
- 'title': video_title,
- 'ext': video_extension,
+ 'title': title,
+ 'ext': extension,
'format': video_format,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- 'uploader': video_uploader,
- 'upload_date': video_upload_date,
- 'uploader_id': video_uploader_id,
- 'view_count': video_view_count,
- 'webpage_url': video_webpage_url,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'duration': duration,
+ 'webpage_url': webpage_url,
}
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
index c2e7b67c7..33daa0dec 100644
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@@ -47,7 +47,7 @@ class NineGagIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
post_view = json.loads(self._html_search_regex(
- r'var postView = new app\.PostView\({\s*post:\s*({.+?}),', webpage, 'post view'))
+ r'var postView = new app\.PostView\({\s*post:\s*({.+?}),\s*posts:\s*prefetchedCurrentPost', webpage, 'post view'))
youtube_id = post_view['videoExternalId']
title = post_view['title']
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
new file mode 100644
index 000000000..fbcbe1f40
--- /dev/null
+++ b/youtube_dl/extractor/npo.py
@@ -0,0 +1,62 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ unified_strdate,
+)
+
+
+class NPOIE(InfoExtractor):
+ IE_NAME = 'npo.nl'
+ _VALID_URL = r'https?://www\.npo\.nl/[^/]+/[^/]+/(?P<id>[^/?]+)'
+
+ _TEST = {
+ 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719',
+ 'md5': '4b3f9c429157ec4775f2c9cb7b911016',
+ 'info_dict': {
+ 'id': 'VPWON_1220719',
+ 'ext': 'mp4',
+ 'title': 'Nieuwsuur',
+ 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
+ 'upload_date': '20140622',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ metadata = self._download_json(
+ 'http://e.omroep.nl/metadata/aflevering/%s' % video_id,
+ video_id,
+ # We have to remove the javascript callback
+ transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//epc', r'\1', j)
+ )
+ token_page = self._download_webpage(
+ 'http://ida.omroep.nl/npoplayer/i.js',
+ video_id,
+ note='Downloading token'
+ )
+ token = self._search_regex(r'npoplayer.token = "(.+?)"', token_page, 'token')
+ streams_info = self._download_json(
+ 'http://ida.omroep.nl/odi/?prid=%s&puboptions=h264_std&adaptive=yes&token=%s' % (video_id, token),
+ video_id
+ )
+
+ stream_info = self._download_json(
+ streams_info['streams'][0] + '&type=json',
+ video_id,
+ 'Downloading stream info'
+ )
+
+ return {
+ 'id': video_id,
+ 'title': metadata['titel'],
+ 'ext': 'mp4',
+ 'url': stream_info['url'],
+ 'description': metadata['info'],
+ 'thumbnail': metadata['images'][-1]['url'],
+ 'upload_date': unified_strdate(metadata['gidsdatum']),
+ }
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index cb4305349..ba3dd707f 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -35,7 +35,8 @@ class RaiIE(SubtitlesInfoExtractor):
'description': '',
'upload_date': '20140612',
'duration': 1758,
- }
+ },
+ 'skip': 'Error 404',
},
{
'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html',
diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py
new file mode 100644
index 000000000..a4f8ce6c3
--- /dev/null
+++ b/youtube_dl/extractor/soundgasm.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class SoundgasmIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)'
+ _TEST = {
+ 'url': 'http://soundgasm.net/u/ytdl/Piano-sample',
+ 'md5': '010082a2c802c5275bb00030743e75ad',
+ 'info_dict': {
+ 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9',
+ 'ext': 'm4a',
+ 'title': 'ytdl_Piano-sample',
+ 'description': 'Royalty Free Sample Music'
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('title')
+ audio_title = mobj.group('user') + '_' + mobj.group('title')
+ webpage = self._download_webpage(url, display_id)
+ audio_url = self._html_search_regex(
+ r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL')
+ audio_id = re.split('\/|\.', audio_url)[-2]
+ description = self._html_search_regex(
+ r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description',
+ fatal=False)
+
+ return {
+ 'id': audio_id,
+ 'display_id': display_id,
+ 'url': audio_url,
+ 'title': audio_title,
+ 'description': description
+ }
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
index 36331529e..25b9864ad 100644
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -20,13 +20,13 @@ class TagesschauIE(InfoExtractor):
'thumbnail': 're:^http:.*\.jpg$',
},
}, {
- 'url': 'http://www.tagesschau.de/multimedia/video/video-196.html',
- 'md5': '8aaa8bf3ae1ca2652309718c03019128',
+ 'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html',
+ 'md5': '66652566900963a3f962333579eeffcf',
'info_dict': {
- 'id': '196',
+ 'id': '5964',
'ext': 'mp4',
- 'title': 'Ukraine-Konflikt: Klitschko in Kiew als Bürgermeister vereidigt',
- 'description': 'md5:f22e4af75821d174fa6c977349682691',
+ 'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland',
+ 'description': 'md5:07bfc78c48eec3145ed4805299a1900a',
'thumbnail': 're:http://.*\.jpg',
},
}]
diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py
index b3cb6bd76..2c2113b14 100644
--- a/youtube_dl/extractor/teachertube.py
+++ b/youtube_dl/extractor/teachertube.py
@@ -14,7 +14,7 @@ class TeacherTubeIE(InfoExtractor):
IE_NAME = 'teachertube'
IE_DESC = 'teachertube.com videos'
- _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=)(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997',
@@ -45,6 +45,15 @@ class TeacherTubeIE(InfoExtractor):
'title': 'PER ASPERA AD ASTRA',
'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P',
},
+ }, {
+ 'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790',
+ 'md5': '9c79fbb2dd7154823996fc28d4a26998',
+ 'info_dict': {
+ 'id': '297790',
+ 'ext': 'mp4',
+ 'title': 'Intro Video - Schleicher',
+ 'description': 'Intro Video - Why to flip, how flipping will',
+ },
}]
def _real_extract(self, url):
@@ -66,6 +75,7 @@ class TeacherTubeIE(InfoExtractor):
media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage)
media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage))
+ media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage))
formats = [
{
@@ -79,28 +89,36 @@ class TeacherTubeIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'thumbnail': self._html_search_regex(r'var\s+thumbUrl\s*=\s*"([^"]+)"', webpage, 'thumbnail'),
+ 'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'),
'formats': formats,
'description': description,
}
-class TeacherTubeClassroomIE(InfoExtractor):
- IE_NAME = 'teachertube:classroom'
- IE_DESC = 'teachertube.com online classrooms'
+class TeacherTubeUserIE(InfoExtractor):
+ IE_NAME = 'teachertube:user:collection'
+ IE_DESC = 'teachertube.com user and collection videos'
+
+ _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?'
- _VALID_URL = r'https?://(?:www\.)?teachertube\.com/view_classroom\.php\?user=(?P<user>[0-9a-zA-Z]+)'
+ _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+</div>.+?<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user_id = mobj.group('user')
- rss = self._download_xml(
- 'http://www.teachertube.com/rssclassroom.php?mode=user&username=%s' % user_id,
- user_id, 'Downloading classroom RSS')
+ urls = []
+ webpage = self._download_webpage(url, user_id)
+ urls.extend(re.findall(self._MEDIA_RE, webpage))
+
+ pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[1:-1]
+ for p in pages:
+ more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p)
+ webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages) + 1))
+ urls.extend(re.findall(self._MEDIA_RE, webpage))
entries = []
- for url in rss.findall('.//{http://search.yahoo.com/mrss/}player'):
- entries.append(self.url_result(url.attrib['url'], 'TeacherTube'))
+ for url in urls:
+ entries.append(self.url_result(url, 'TeacherTube'))
return self.playlist_result(entries, user_id)
diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py
index 34008afc6..0f389bd93 100644
--- a/youtube_dl/extractor/toypics.py
+++ b/youtube_dl/extractor/toypics.py
@@ -1,10 +1,13 @@
+# -*- coding:utf-8 -*-
+from __future__ import unicode_literals
+
from .common import InfoExtractor
import re
class ToypicsIE(InfoExtractor):
IE_DESC = 'Toypics user profile'
- _VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
+ _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
_TEST = {
'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
'md5': '16e806ad6d6f58079d210fe30985e08b',
@@ -61,7 +64,7 @@ class ToypicsUserIE(InfoExtractor):
note='Downloading page %d/%d' % (n, page_count))
urls.extend(
re.findall(
- r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">',
+ r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">',
lpage))
return {
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 544369068..2882c1809 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
@@ -10,14 +11,27 @@ from ..utils import (
class TumblrIE(InfoExtractor):
_VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)($|/)'
- _TEST = {
+ _TESTS = [{
'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
- 'file': '54196191430.mp4',
'md5': '479bb068e5b16462f5176a6828829767',
'info_dict': {
- "title": "tatiana maslany news"
+ 'id': '54196191430',
+ 'ext': 'mp4',
+ 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...',
+ 'description': 'md5:dfac39636969fe6bf1caa2d50405f069',
+ 'thumbnail': 're:http://.*\.jpg',
}
- }
+ }, {
+ 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all',
+ 'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359',
+ 'info_dict': {
+ 'id': '90208453769',
+ 'ext': 'mp4',
+ 'title': '5SOS STRUM ;)',
+ 'description': 'md5:dba62ac8639482759c8eb10ce474586a',
+ 'thumbnail': 're:http://.*\.jpg',
+ }
+ }]
def _real_extract(self, url):
m_url = re.match(self._VALID_URL, url)
@@ -48,6 +62,7 @@ class TumblrIE(InfoExtractor):
return [{'id': video_id,
'url': video_url,
'title': video_title,
+ 'description': self._html_search_meta('description', webpage),
'thumbnail': video_thumbnail,
'ext': ext
}]
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index fb132aef6..a7953a7e7 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -49,6 +49,7 @@ class VeohIE(InfoExtractor):
'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
'uploader': 'newsy-videos',
},
+ 'skip': 'This video has been deleted.',
},
]
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
index b5034b02f..a647807d0 100644
--- a/youtube_dl/extractor/videott.py
+++ b/youtube_dl/extractor/videott.py
@@ -4,7 +4,10 @@ import re
import base64
from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+ unified_strdate,
+ int_or_none,
+)
class VideoTtIE(InfoExtractor):
@@ -50,9 +53,9 @@ class VideoTtIE(InfoExtractor):
'thumbnail': settings['config']['thumbnail'],
'upload_date': unified_strdate(video['added']),
'uploader': video['owner'],
- 'view_count': int(video['view_count']),
- 'comment_count': int(video['comment_count']),
- 'like_count': int(video['liked']),
- 'dislike_count': int(video['disliked']),
+ 'view_count': int_or_none(video['view_count']),
+ 'comment_count': None if video.get('comment_count') == '--' else int_or_none(video['comment_count']),
+ 'like_count': int_or_none(video['liked']),
+ 'dislike_count': int_or_none(video['disliked']),
'formats': formats,
} \ No newline at end of file
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index fb082f364..918bd1098 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -16,7 +16,7 @@ from ..utils import (
class VKIE(InfoExtractor):
IE_NAME = 'vk.com'
- _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
+ _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk'
_TESTS = [
@@ -27,7 +27,7 @@ class VKIE(InfoExtractor):
'id': '162222515',
'ext': 'flv',
'title': 'ProtivoGunz - Хуёвая песня',
- 'uploader': 'Noize MC',
+ 'uploader': 're:Noize MC.*',
'duration': 195,
},
},
@@ -62,11 +62,47 @@ class VKIE(InfoExtractor):
'id': '164049491',
'ext': 'mp4',
'uploader': 'Триллеры',
- 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0',
+ 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
'duration': 8352,
},
'skip': 'Requires vk account credentials',
},
+ {
+ 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
+ 'md5': 'd82c22e449f036282d1d3f7f4d276869',
+ 'info_dict': {
+ 'id': '166094326',
+ 'ext': 'mp4',
+ 'uploader': 'Киномания - лучшее из мира кино',
+ 'title': 'Запах женщины (1992)',
+ 'duration': 9392,
+ },
+ 'skip': 'Requires vk account credentials',
+ },
+ {
+ 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
+ 'md5': '4d7a5ef8cf114dfa09577e57b2993202',
+ 'info_dict': {
+ 'id': '168067957',
+ 'ext': 'mp4',
+ 'uploader': 'Киномания - лучшее из мира кино',
+ 'title': ' ',
+ 'duration': 7291,
+ },
+ 'skip': 'Requires vk account credentials',
+ },
+ {
+ 'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540',
+ 'md5': '0c45586baa71b7cb1d0784ee3f4e00a6',
+ 'note': 'ivi.ru embed',
+ 'info_dict': {
+ 'id': '60690',
+ 'ext': 'mp4',
+ 'title': 'Книга Илая',
+ 'duration': 6771,
+ },
+ 'skip': 'Only works from Russia',
+ },
]
def _login(self):
@@ -110,6 +146,16 @@ class VKIE(InfoExtractor):
if m_yt is not None:
self.to_screen('Youtube video detected')
return self.url_result(m_yt.group(1), 'Youtube')
+
+ m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
+ if m_opts:
+ m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
+ if m_opts_url:
+ opts_url = m_opts_url.group(1)
+ if opts_url.startswith('//'):
+ opts_url = 'http:' + opts_url
+ return self.url_result(opts_url)
+
data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
data = json.loads(data_json)
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index feeb44b45..f741ba540 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
@@ -54,14 +55,14 @@ class WDRIE(InfoExtractor):
},
},
{
- 'url': 'http://www.funkhauseuropa.de/av/audiogrenzenlosleckerbaklava101-audioplayer.html',
- 'md5': 'cfff440d4ee64114083ac44676df5d15',
+ 'url': 'http://www.funkhauseuropa.de/av/audiosuepersongsoulbossanova100-audioplayer.html',
+ 'md5': '24e83813e832badb0a8d7d1ef9ef0691',
'info_dict': {
- 'id': 'mdb-363068',
+ 'id': 'mdb-463528',
'ext': 'mp3',
- 'title': 'Grenzenlos lecker - Baklava',
+ 'title': 'Süpersong: Soul Bossa Nova',
'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
- 'upload_date': '20140311',
+ 'upload_date': '20140630',
},
},
]
@@ -127,9 +128,10 @@ class WDRMobileIE(InfoExtractor):
'info_dict': {
'title': '4283021',
'id': '421735',
+ 'ext': 'mp4',
'age_limit': 0,
},
- '_skip': 'Will be depublicized shortly'
+ 'skip': 'Problems with loading data.'
}
def _real_extract(self, url):
@@ -139,6 +141,7 @@ class WDRMobileIE(InfoExtractor):
'title': mobj.group('title'),
'age_limit': int(mobj.group('age_limit')),
'url': url,
+ 'ext': determine_ext(url),
'user_agent': 'mobile',
}
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 6bdea1c44..6123e1256 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -865,71 +865,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
"""Turn the encrypted s field into a working signature"""
- if player_url is not None:
- if player_url.startswith(u'//'):
- player_url = u'https:' + player_url
- try:
- player_id = (player_url, len(s))
- if player_id not in self._player_cache:
- func = self._extract_signature_function(
- video_id, player_url, len(s)
- )
- self._player_cache[player_id] = func
- func = self._player_cache[player_id]
- if self._downloader.params.get('youtube_print_sig_code'):
- self._print_sig_code(func, len(s))
- return func(s)
- except Exception:
- tb = traceback.format_exc()
- self._downloader.report_warning(
- u'Automatic signature extraction failed: ' + tb)
-
- self._downloader.report_warning(
- u'Warning: Falling back to static signature algorithm')
-
- return self._static_decrypt_signature(
- s, video_id, player_url, age_gate)
-
- def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
- if age_gate:
- # The videos with age protection use another player, so the
- # algorithms can be different.
- if len(s) == 86:
- return s[2:63] + s[82] + s[64:82] + s[63]
-
- if len(s) == 93:
- return s[86:29:-1] + s[88] + s[28:5:-1]
- elif len(s) == 92:
- return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
- elif len(s) == 91:
- return s[84:27:-1] + s[86] + s[26:5:-1]
- elif len(s) == 90:
- return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
- elif len(s) == 89:
- return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
- elif len(s) == 88:
- return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
- elif len(s) == 87:
- return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
- elif len(s) == 86:
- return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
- elif len(s) == 85:
- return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
- elif len(s) == 84:
- return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
- elif len(s) == 83:
- return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
- elif len(s) == 82:
- return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
- elif len(s) == 81:
- return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
- elif len(s) == 80:
- return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
- elif len(s) == 79:
- return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
+ if player_url is None:
+ raise ExtractorError(u'Cannot decrypt signature without player_url')
- else:
- raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
+ if player_url.startswith(u'//'):
+ player_url = u'https:' + player_url
+ try:
+ player_id = (player_url, len(s))
+ if player_id not in self._player_cache:
+ func = self._extract_signature_function(
+ video_id, player_url, len(s)
+ )
+ self._player_cache[player_id] = func
+ func = self._player_cache[player_id]
+ if self._downloader.params.get('youtube_print_sig_code'):
+ self._print_sig_code(func, len(s))
+ return func(s)
+ except Exception as e:
+ tb = traceback.format_exc()
+ raise ExtractorError(
+ u'Automatic signature extraction failed: ' + tb, cause=e)
def _get_available_subtitles(self, video_id, webpage):
try:
@@ -1698,14 +1653,14 @@ class YoutubeSearchURLIE(InfoExtractor):
webpage = self._download_webpage(url, query)
result_code = self._search_regex(
- r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')
+ r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')
part_codes = re.findall(
r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
entries = []
for part_code in part_codes:
part_title = self._html_search_regex(
- r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
+ [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
part_url_snippet = self._html_search_regex(
r'(?s)href="([^"]+)"', part_code, 'item URL')
part_url = compat_urlparse.urljoin(
@@ -1825,10 +1780,21 @@ class YoutubeTruncatedURLIE(InfoExtractor):
IE_NAME = 'youtube:truncated_url'
IE_DESC = False # Do not list
_VALID_URL = r'''(?x)
- (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
+ (?:https?://)?[^/]+/watch\?(?:
+ feature=[a-z_]+|
+ annotation_id=annotation_[^&]+
+ )?$|
(?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
'''
+ _TESTS = [{
+ 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.youtube.com/watch?',
+ 'only_matching': True,
+ }]
+
def _real_extract(self, url):
raise ExtractorError(
u'Did you forget to quote the URL? Remember that & is a meta '