about | summary | refs | log | tree | commit | diff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rw-r--r-- youtube_dl/extractor/biqle.py 39
-rw-r--r-- youtube_dl/extractor/dailymail.py 61
-rw-r--r-- youtube_dl/extractor/extractors.py 7
-rw-r--r-- youtube_dl/extractor/fczenit.py 33
-rw-r--r-- youtube_dl/extractor/kuwo.py 1
-rw-r--r-- youtube_dl/extractor/periscope.py 37
-rw-r--r-- youtube_dl/extractor/redtube.py 58
-rw-r--r-- youtube_dl/extractor/vevo.py 5
-rw-r--r-- youtube_dl/extractor/vk.py 46
-rw-r--r-- youtube_dl/extractor/youtube.py 4
-rw-r--r-- youtube_dl/utils.py 15
11 files changed, 252 insertions, 54 deletions
diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py
new file mode 100644
index 000000000..ae4579b33
--- /dev/null
+++ b/youtube_dl/extractor/biqle.py
@@ -0,0 +1,39 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BIQLEIE(InfoExtractor):
+    # Extractor for biqle watch pages (.com, .org and .ru hosts).
+    # The watch page is only a wrapper: the player lives in an <iframe>
+    # served from daxab.com, so extraction finds that iframe and returns
+    # its address as a 'url_transparent' result.
+    _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
+    _TESTS = [{
+        'url': 'http://www.biqle.ru/watch/847655_160197695',
+        'md5': 'ad5f746a874ccded7b8f211aeea96637',
+        'info_dict': {
+            'id': '160197695',
+            'ext': 'mp4',
+            'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)',
+            'uploader': 'Andrey Rogozin',
+            'upload_date': '20110605',
+        }
+    }, {
+        'url': 'https://biqle.org/watch/-44781847_168547604',
+        'md5': '7f24e72af1db0edf7c1aaba513174f97',
+        'info_dict': {
+            'id': '168547604',
+            'ext': 'mp4',
+            'title': 'Ребенок в шоке от автоматической мойки',
+            'uploader': 'Dmitry Kotov',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        # The iframe src may be protocol-relative or carry an explicit
+        # scheme. Accept https as well as http so that pages embedding the
+        # player over TLS do not fail with "Unable to extract embed url".
+        embed_url = self._proto_relative_url(self._search_regex(
+            r'<iframe.+?src="((?:https?:)?//daxab\.com/[^"]+)".*?></iframe>',
+            webpage, 'embed url'))
+
+        return {
+            '_type': 'url_transparent',
+            'url': embed_url,
+        }
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
new file mode 100644
index 000000000..b60a1d813
--- /dev/null
+++ b/youtube_dl/extractor/dailymail.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ determine_protocol,
+)
+
+
+class DailyMailIE(InfoExtractor):
+    # Extractor for dailymail.co.uk video pages.
+    # Player options are read from the page's data-opts attribute; the
+    # list of renditions is then fetched as JSON, either from the URL named
+    # in those options or from the site's per-video sources endpoint.
+    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html',
+        'md5': '2f639d446394f53f3a33658b518b6615',
+        'info_dict': {
+            'id': '1288527',
+            'ext': 'mp4',
+            'title': 'Turn any video into an impressionist masterpiece',
+            'description': 'md5:88ddbcb504367987b2708bb38677c9d2',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_data = self._parse_json(self._search_regex(
+            r"data-opts='({.+?})'", webpage, 'video data'), video_id)
+        title = video_data['title']
+
+        # 'sources' may be absent or explicitly null in the player options;
+        # `or {}` covers both, whereas .get('sources', {}) would only cover
+        # the missing-key case and crash on null.
+        sources_url = (video_data.get('sources') or {}).get('url')
+        video_sources = self._download_json(
+            sources_url or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id,
+            video_id)
+
+        formats = []
+        for rendition in video_sources['renditions']:
+            rendition_url = rendition.get('url')
+            if not rendition_url:
+                continue
+            # Second argument is the divisor passed to int_or_none;
+            # presumably converts bit/s to kbit/s - confirm against utils.
+            tbr = int_or_none(rendition.get('encodingRate'), 1000)
+            container = rendition.get('videoContainer')
+            # An M2TS container is treated as an HLS stream.
+            is_hls = container == 'M2TS'
+            protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
+            formats.append({
+                'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''),
+                'url': rendition_url,
+                'width': int_or_none(rendition.get('frameWidth')),
+                'height': int_or_none(rendition.get('frameHeight')),
+                'tbr': tbr,
+                'vcodec': rendition.get('videoCodec'),
+                'container': container,
+                'protocol': protocol,
+                'ext': 'mp4' if is_hls else None,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('descr'),
+            'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index ef4431364..14b4f245f 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -75,6 +75,7 @@ from .bigflix import BigflixIE
from .bild import BildIE
from .bilibili import BiliBiliIE
from .biobiochiletv import BioBioChileTVIE
+from .biqle import BIQLEIE
from .bleacherreport import (
BleacherReportIE,
BleacherReportCMSIE,
@@ -157,6 +158,7 @@ from .cspan import CSpanIE
from .ctsnews import CtsNewsIE
from .cultureunplugged import CultureUnpluggedIE
from .cwtv import CWTVIE
+from .dailymail import DailyMailIE
from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
@@ -560,7 +562,10 @@ from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
from .people import PeopleIE
-from .periscope import PeriscopeIE
+from .periscope import (
+ PeriscopeIE,
+ PeriscopeUserIE,
+)
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py
index f1f150ef2..8d1010b88 100644
--- a/youtube_dl/extractor/fczenit.py
+++ b/youtube_dl/extractor/fczenit.py
@@ -1,20 +1,19 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_urlparse
class FczenitIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://fc-zenit.ru/video/gl6785/',
- 'md5': '458bacc24549173fe5a5aa29174a5606',
+ 'url': 'http://fc-zenit.ru/video/41044/',
+ 'md5': '0e3fab421b455e970fa1aa3891e57df0',
'info_dict': {
- 'id': '6785',
+ 'id': '41044',
'ext': 'mp4',
- 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+ 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',
},
}
@@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+ video_title = self._html_search_regex(
+ r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+
+ video_items = self._parse_json(self._search_regex(
+ r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'),
+ video_id)
- bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
- bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+ def merge_dicts(*dicts):
+ ret = {}
+ for a_dict in dicts:
+ ret.update(a_dict)
+ return ret
formats = [{
- 'url': furl,
- 'tbr': tbr,
- } for furl, tbr in bitrates]
+ 'url': compat_urlparse.urljoin(url, video_url),
+ 'tbr': int(tbr),
+ } for tbr, video_url in merge_dicts(*video_items).items()]
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index 616ed19e1..11b31a699 100644
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -266,6 +266,7 @@ class KuwoCategoryIE(InfoExtractor):
'info_dict': {
'id': '86375',
'title': '八十年代精选',
+ 'description': '这些都是属于八十年代的回忆!',
},
'playlist_mincount': 24,
}
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index 514e9b433..0a4bc761d 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -7,6 +7,7 @@ from ..utils import parse_iso8601
class PeriscopeIE(InfoExtractor):
IE_DESC = 'Periscope'
+ IE_NAME = 'periscope'
_VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'
# Alive example URLs can be found here http://onperiscope.com/
_TESTS = [{
@@ -79,3 +80,39 @@ class PeriscopeIE(InfoExtractor):
'thumbnails': thumbnails,
'formats': formats,
}
+
+
+class PeriscopeUserIE(InfoExtractor):
+    # Playlist extractor for a Periscope user page: every broadcast listed
+    # in the page's 'user-broadcasts' meta tag becomes one playlist entry.
+    # 'www.' is optional here, matching PeriscopeIE's _VALID_URL, so that
+    # bare-host user URLs are recognised as well.
+    _VALID_URL = r'https?://(?:www\.)?periscope\.tv/(?P<id>[^/]+)/?$'
+    IE_DESC = 'Periscope user videos'
+    IE_NAME = 'periscope:user'
+
+    _TEST = {
+        'url': 'https://www.periscope.tv/LularoeHusbandMike/',
+        'info_dict': {
+            'id': 'LularoeHusbandMike',
+            'title': 'LULAROE HUSBAND MIKE',
+        },
+        # Periscope only shows videos in the last 24 hours, so it's possible to
+        # get 0 videos
+        'playlist_mincount': 0,
+    }
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, user_id)
+
+        # Both meta tags default to an empty JSON object so that a page
+        # without them yields an empty, untitled playlist instead of failing.
+        broadcast_data = self._parse_json(self._html_search_meta(
+            'broadcast-data', webpage, default='{}'), user_id)
+        # `or {}` / `or []` also cover keys that are present but null.
+        username = (broadcast_data.get('user') or {}).get('display_name')
+        user_broadcasts = self._parse_json(
+            self._html_search_meta('user-broadcasts', webpage, default='{}'),
+            user_id)
+
+        entries = [
+            self.url_result(
+                'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id']))
+            for broadcast in user_broadcasts.get('broadcasts') or []]
+
+        return self.playlist_result(entries, user_id, username)
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py
index 7ba41ba59..721fc3a9e 100644
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -1,7 +1,12 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ str_to_int,
+ unified_strdate,
+)
class RedTubeIE(InfoExtractor):
@@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor):
'id': '66418',
'ext': 'mp4',
'title': 'Sucked on a toilet',
+ 'upload_date': '20120831',
+ 'duration': 596,
+ 'view_count': int,
'age_limit': 18,
}
}
@@ -24,12 +32,39 @@ class RedTubeIE(InfoExtractor):
if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
raise ExtractorError('Video %s has been removed' % video_id, expected=True)
- video_url = self._html_search_regex(
- r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
- video_title = self._html_search_regex(
- r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
- webpage, 'title')
- video_thumbnail = self._og_search_thumbnail(webpage)
+ # Two patterns, tried in order: the <h1> heading, then the player config.
+ # The fallback captures everything up to the quote character that opened
+ # the value (group 1); an empty named group here could never yield a title.
+ title = self._html_search_regex(
+ (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
+ r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
+ webpage, 'title', group='title')
+
+ formats = []
+ sources = self._parse_json(
+ self._search_regex(
+ r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
+ video_id, fatal=False)
+ if sources and isinstance(sources, dict):
+ for format_id, format_url in sources.items():
+ if format_url:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'height': int_or_none(format_id),
+ })
+ else:
+ video_url = self._html_search_regex(
+ r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
+ formats.append({'url': video_url})
+ self._sort_formats(formats)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+ upload_date = unified_strdate(self._search_regex(
+ r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
+ webpage, 'upload date', fatal=False))
+ duration = int_or_none(self._search_regex(
+ r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+ view_count = str_to_int(self._search_regex(
+ r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
+ webpage, 'view count', fatal=False))
# No self-labeling, but they describe themselves as
# "Home of Videos Porno"
@@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'thumbnail': video_thumbnail,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'view_count': view_count,
'age_limit': age_limit,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index c0ef08c02..c0632cd6a 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -203,7 +203,8 @@ class VevoIE(VevoBaseIE):
json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
response = self._download_json(
- json_url, video_id, 'Downloading video info', 'Unable to download info')
+ json_url, video_id, 'Downloading video info',
+ 'Unable to download info', fatal=False) or {}
video_info = response.get('video') or {}
artist = None
featured_artist = None
@@ -212,7 +213,7 @@ class VevoIE(VevoBaseIE):
formats = []
if not video_info:
- if response.get('statusCode') != 909:
+ if response and response.get('statusCode') != 909:
ytid = response.get('errorInfo', {}).get('ytid')
if ytid:
self.report_warning(
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 67220f1b7..041d93629 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -26,12 +26,16 @@ class VKIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|
+ (?:
+ (?:m\.)?vk\.com/video_|
+ (?:www\.)?daxab.com/
+ )
+ ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
(?:
(?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
- (?:www\.)?biqle\.ru/watch/
+ (?:www\.)?daxab.com/embed/
)
- (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$)
+ (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
)
'''
_NETRC_MACHINE = 'vk'
@@ -75,7 +79,8 @@ class VKIE(InfoExtractor):
'duration': 101,
'upload_date': '20120730',
'view_count': int,
- }
+ },
+ 'skip': 'This video has been removed from public access.',
},
{
# VIDEO NOW REMOVED
@@ -142,7 +147,7 @@ class VKIE(InfoExtractor):
'id': 'V3K4mi0SYkc',
'ext': 'webm',
'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
- 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+ 'description': 'md5:d9903938abdc74c738af77f527ca0596',
'duration': 178,
'upload_date': '20130116',
'uploader': "Children's Joy Foundation",
@@ -174,11 +179,6 @@ class VKIE(InfoExtractor):
'only_matching': True,
},
{
- # vk wrapper
- 'url': 'http://www.biqle.ru/watch/847655_160197695',
- 'only_matching': True,
- },
- {
# pladform embed
'url': 'https://vk.com/video-76116461_171554880',
'only_matching': True,
@@ -217,20 +217,22 @@ class VKIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- if not video_id:
+ info_url = url
+ if video_id:
+ info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+ # Some videos (removed?) can only be downloaded with list id specified
+ list_id = mobj.group('list_id')
+ if list_id:
+ info_url += '&list=%s' % list_id
+ else:
+ info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
- info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
-
- # Some videos (removed?) can only be downloaded with list id specified
- list_id = mobj.group('list_id')
- if list_id:
- info_url += '&list=%s' % list_id
-
info_page = self._download_webpage(info_url, video_id)
error_message = self._html_search_regex(
- r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
info_page, 'error message', default=None)
if error_message:
raise ExtractorError(error_message, expected=True)
@@ -305,17 +307,17 @@ class VKIE(InfoExtractor):
view_count = None
views = self._html_search_regex(
r'"mv_views_count_number"[^>]*>(.+?\bviews?)<',
- info_page, 'view count', fatal=False)
+ info_page, 'view count', default=None)
if views:
view_count = str_to_int(self._search_regex(
r'([\d,.]+)', views, 'view count', fatal=False))
formats = []
for k, v in data.items():
- if not k.startswith('url') and k != 'extra_data' or not v:
+ if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v:
continue
height = int_or_none(self._search_regex(
- r'^url(\d+)', k, 'height', default=None))
+ r'^(?:url|cache)(\d+)', k, 'height', default=None))
formats.append({
'format_id': k,
'url': v,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index b7c3cb63f..f3f102c30 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1326,9 +1326,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if video_description:
video_description = re.sub(r'''(?x)
<a\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ (?:[a-zA-Z-]+="[^"]*"\s+)*?
(?:title|href)="([^"]+)"\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ (?:[a-zA-Z-]+="[^"]*"\s+)*?
class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
[^<]+\.{3}\s*
</a>
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 7bcc85e2b..6e4573784 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -14,8 +14,8 @@ import email.utils
import errno
import functools
import gzip
-import itertools
import io
+import itertools
import json
import locale
import math
@@ -24,8 +24,8 @@ import os
import pipes
import platform
import re
-import ssl
import socket
+import ssl
import struct
import subprocess
import sys
@@ -89,6 +89,11 @@ KNOWN_EXTENSIONS = (
'wav',
'f4f', 'f4m', 'm3u8', 'smil')
+# needed for sanitizing filenames in restricted mode
+# Maps each accented Latin character of the first string to an ASCII
+# replacement; zip() pairs the two sequences position by position. Most
+# replacements are a single letter, but 'Æ', 'ß' and 'æ' become the
+# two-letter strings 'AE', 'ss' and 'ae', which is why itertools.chain()
+# interleaves one-element lists with the plain replacement strings.
+# NOTE(review): 'Ð' maps to 'D' while lowercase 'ð' maps to 'o' - confirm
+# that this asymmetry is intended.
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
+ itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
+ 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
+
def preferredencoding():
"""Get preferred encoding.
@@ -251,9 +256,9 @@ def get_element_by_attribute(attribute, value, html):
m = re.search(r'''(?xs)
<([a-zA-Z0-9:._-]+)
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
\s+%s=['"]?%s['"]?
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
\s*>
(?P<content>.*?)
</\1>
@@ -365,6 +370,8 @@ def sanitize_filename(s, restricted=False, is_id=False):
Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
"""
def replace_insane(char):
+ if restricted and char in ACCENT_CHARS:
+ return ACCENT_CHARS[char]
if char == '?' or ord(char) < 32 or ord(char) == 127:
return ''
elif char == '"':