Diffstat (limited to 'youtube_dl')
-rwxr-xr-x | youtube_dl/YoutubeDL.py               |  54
-rw-r--r-- | youtube_dl/extractor/__init__.py      |  13
-rw-r--r-- | youtube_dl/extractor/abc7news.py      |  68
-rw-r--r-- | youtube_dl/extractor/cnn.py           |  12
-rw-r--r-- | youtube_dl/extractor/comedycentral.py |   4
-rw-r--r-- | youtube_dl/extractor/generic.py       |   3
-rw-r--r-- | youtube_dl/extractor/hearthisat.py    | 117
-rw-r--r-- | youtube_dl/extractor/nbc.py           |  24
-rw-r--r-- | youtube_dl/extractor/ndtv.py          |   4
-rw-r--r-- | youtube_dl/extractor/pornhub.py       |  10
-rw-r--r-- | youtube_dl/extractor/streetvoice.py   |  51
-rw-r--r-- | youtube_dl/extractor/testtube.py      |  60
-rw-r--r-- | youtube_dl/extractor/tinypic.py       |  26
-rw-r--r-- | youtube_dl/extractor/tvp.py           |  37
-rw-r--r-- | youtube_dl/extractor/twitch.py        | 430
-rw-r--r-- | youtube_dl/extractor/videomega.py     |  10
-rw-r--r-- | youtube_dl/options.py                 |  13
-rw-r--r-- | youtube_dl/utils.py                   |  28
-rw-r--r-- | youtube_dl/version.py                 |   2
19 files changed, 754 insertions(+), 212 deletions(-)
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 772fddd45..8ef74e414 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -10,6 +10,7 @@ import io
 import itertools
 import json
 import locale
+import operator
 import os
 import platform
 import re
@@ -49,6 +50,7 @@ from .utils import (
     make_HTTPS_handler,
     MaxDownloadsReached,
     PagedList,
+    parse_filesize,
     PostProcessingError,
     platform_name,
     preferredencoding,
@@ -768,7 +770,59 @@ class YoutubeDL(object):
         else:
             raise Exception('Invalid result type: %s' % result_type)
 
+    def _apply_format_filter(self, format_spec, available_formats):
+        " Returns a tuple of the remaining format_spec and filtered formats "
+
+        OPERATORS = {
+            '<': operator.lt,
+            '<=': operator.le,
+            '>': operator.gt,
+            '>=': operator.ge,
+            '=': operator.eq,
+            '!=': operator.ne,
+        }
+        operator_rex = re.compile(r'''(?x)\s*\[
+            (?P<key>width|height|tbr|abr|vbr|filesize)
+            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
+            \]$
+            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
+        m = operator_rex.search(format_spec)
+        if not m:
+            raise ValueError('Invalid format specification %r' % format_spec)
+
+        try:
+            comparison_value = int(m.group('value'))
+        except ValueError:
+            comparison_value = parse_filesize(m.group('value'))
+            if comparison_value is None:
+                comparison_value = parse_filesize(m.group('value') + 'B')
+            if comparison_value is None:
+                raise ValueError(
+                    'Invalid value %r in format specification %r' % (
+                        m.group('value'), format_spec))
+        op = OPERATORS[m.group('op')]
+
+        def _filter(f):
+            actual_value = f.get(m.group('key'))
+            if actual_value is None:
+                return m.group('none_inclusive')
+            return op(actual_value, comparison_value)
+        new_formats = [f for f in available_formats if _filter(f)]
+
+        new_format_spec = format_spec[:-len(m.group(0))]
+        if not new_format_spec:
+            new_format_spec = 'best'
+
+        return (new_format_spec, new_formats)
+
     def select_format(self, format_spec, available_formats):
+        while format_spec.endswith(']'):
+            format_spec, available_formats = self._apply_format_filter(
+                format_spec, available_formats)
+        if not available_formats:
+            return None
+
         if format_spec == 'best' or format_spec is None:
             return available_formats[-1]
         elif format_spec == 'worst':
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 0902eb437..9ab90ac62 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 
 from .abc import ABCIE
+from .abc7news import Abc7NewsIE
 from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
 from .adobetv import AdobeTVIE
@@ -175,6 +176,7 @@ from .goshgay import GoshgayIE
 from .grooveshark import GroovesharkIE
 from .groupon import GrouponIE
 from .hark import HarkIE
+from .hearthisat import HearThisAtIE
 from .heise import HeiseIE
 from .hellporno import HellPornoIE
 from .helsinki import HelsinkiIE
@@ -408,6 +410,7 @@ from .stanfordoc import StanfordOpenClassroomIE
 from .steam import SteamIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
+from .streetvoice import StreetVoiceIE
 from .sunporno import SunPornoIE
 from .swrmediathek import SWRMediathekIE
 from .syfy import SyfyIE
@@ -429,6 +432,7 @@ from .telemb import TeleMBIE
 from .teletask import TeleTaskIE
 from .tenplay import TenPlayIE
 from .testurl import TestURLIE
+from .testtube import TestTubeIE
 from .tf1 import TF1IE
 from .theonion import TheOnionIE
 from .theplatform import ThePlatformIE
@@ -457,7 +461,14 @@ from .tvigle import TvigleIE
 from .tvp import TvpIE, TvpSeriesIE
 from .tvplay import TVPlayIE
 from .twentyfourvideo import TwentyFourVideoIE
-from .twitch import TwitchIE
+from .twitch import (
+    TwitchVideoIE,
+    TwitchChapterIE,
+    TwitchVodIE,
+    TwitchProfileIE,
+    TwitchPastBroadcastsIE,
+    TwitchStreamIE,
+)
 from .ubu import UbuIE
 from .udemy import (
     UdemyIE,
diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py
new file mode 100644
index 000000000..c04949c21
--- /dev/null
+++ b/youtube_dl/extractor/abc7news.py
@@ -0,0 +1,68 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class Abc7NewsIE(InfoExtractor):
+    _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P<display_id>[^/]+))?/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
+            'info_dict': {
+                'id': '472581',
+                'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
+                'ext': 'mp4',
+                'title': 'East Bay museum celebrates history of synthesized music',
+                'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'timestamp': 1421123075,
+                'upload_date': '20150113',
+                'uploader': 'Jonathan Bloom',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://abc7news.com/472581',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        m3u8 = self._html_search_meta(
+            'contentURL', webpage, 'm3u8 url', fatal=True)
+
+        formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4')
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage).strip()
+        description = self._og_search_description(webpage).strip()
+        thumbnail = self._og_search_thumbnail(webpage)
+        timestamp = parse_iso8601(self._search_regex(
+            r'<div class="meta">\s*<time class="timeago" datetime="([^"]+)">',
+            webpage, 'upload date', fatal=False))
+        uploader = self._search_regex(
+            r'rel="author">([^<]+)</a>',
+            webpage, 'uploader', default=None)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 93e8d0de3..90ea07438 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -51,7 +51,7 @@ class CNNIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         path = mobj.group('path')
         page_title = mobj.group('title')
-        info_url = 'http://cnn.com/video/data/3.0/%s/index.xml' % path
+        info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path
         info = self._download_xml(info_url, page_title)
 
         formats = []
@@ -143,13 +143,13 @@ class CNNArticleIE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
     _TEST = {
         'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
-        'md5': '275b326f85d80dff7592a9820f5dc887',
+        'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
         'info_dict': {
-            'id': 'bestoftv/2014/12/21/sotu-crowley-president-obama-north-korea-not-going-to-be-intimidated.cnn',
+            'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
             'ext': 'mp4',
-            'title': 'Obama: We\'re not going to be intimidated',
-            'description': 'md5:e735586f3dc936075fa654a4d91b21f9',
-            'upload_date': '20141220',
+            'title': 'Obama: Cyberattack not an act of war',
+            'description': 'md5:51ce6750450603795cad0cdfbd7d05c5',
+            'upload_date': '20141221',
         },
         'add_ie': ['CNN'],
     }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 8d27af5e5..15ca361f0 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -34,12 +34,12 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
 
 class ComedyCentralShowsIE(MTVServicesInfoExtractor):
     IE_DESC = 'The Daily Show / The Colbert Report'
-    # urls can be abbreviations like :thedailyshow or :colbert
+    # urls can be abbreviations like :thedailyshow
     # urls for episodes like:
     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
-    _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
+    _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)
                       |https?://(:www\.)?
                           (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
                          ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a5bf9392..b893d8149 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -17,6 +17,7 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     HEADRequest,
+    is_html,
     orderedSet,
     parse_xml,
     smuggle_url,
@@ -647,7 +648,7 @@ class GenericIE(InfoExtractor):
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
         first_bytes = full_response.read(512)
-        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+        if not is_html(first_bytes):
             self._downloader.report_warning(
                 'URL could be a direct video link, returning it as such.')
             upload_date = unified_strdate(
diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py
new file mode 100644
index 000000000..a19b31ac0
--- /dev/null
+++ b/youtube_dl/extractor/hearthisat.py
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    HEADRequest,
+    str_to_int,
+    urlencode_postdata,
+    urlhandle_detect_ext,
+)
+
+
+class HearThisAtIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
+    _PLAYLIST_URL = 'https://hearthis.at/playlist.php'
+    _TEST = {
+        'url': 'https://hearthis.at/moofi/dr-kreep',
+        'md5': 'ab6ec33c8fed6556029337c7885eb4e0',
+        'info_dict': {
+            'id': '150939',
+            'ext': 'wav',
+            'title': 'Moofi - Dr. Kreep',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1421564134,
+            'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.',
+            'upload_date': '20150118',
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'duration': 71,
+            'categories': ['Experimental'],
+        }
+    }
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        display_id = '{artist:s} - {title:s}'.format(**m.groupdict())
+
+        webpage = self._download_webpage(url, display_id)
+        track_id = self._search_regex(
+            r'intTrackId\s*=\s*(\d+)', webpage, 'track ID')
+
+        payload = urlencode_postdata({'tracks[]': track_id})
+        req = compat_urllib_request.Request(self._PLAYLIST_URL, payload)
+        req.add_header('Content-type', 'application/x-www-form-urlencoded')
+
+        track = self._download_json(req, track_id, 'Downloading playlist')[0]
+        title = '{artist:s} - {title:s}'.format(**track)
+
+        categories = None
+        if track.get('category'):
+            categories = [track['category']]
+
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        meta_span = r'<span[^>]+class="%s".*?</i>([^<]+)</span>'
+        view_count = str_to_int(self._search_regex(
+            meta_span % 'plays_count', webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._search_regex(
+            meta_span % 'likes_count', webpage, 'like count', fatal=False))
+        comment_count = str_to_int(self._search_regex(
+            meta_span % 'comment_count', webpage, 'comment count', fatal=False))
+        duration = str_to_int(self._search_regex(
+            r'data-length="(\d+)', webpage, 'duration', fatal=False))
+        timestamp = str_to_int(self._search_regex(
+            r'<span[^>]+class="calctime"[^>]+data-time="(\d+)',
+            webpage, 'timestamp', fatal=False))
+
+        formats = []
+        mp3_url = self._search_regex(
+            r'(?s)<a class="player-link"\s+(?:[a-zA-Z0-9_:-]+="[^"]+"\s+)*?data-mp3="([^"]+)"',
+            webpage, 'mp3 URL', fatal=False)
+        if mp3_url:
+            formats.append({
+                'format_id': 'mp3',
+                'vcodec': 'none',
+                'acodec': 'mp3',
+                'url': mp3_url,
+            })
+        download_path = self._search_regex(
+            r'<a class="[^"]*download_fct[^"]*"\s+href="([^"]+)"',
+            webpage, 'download URL', default=None)
+        if download_path:
+            download_url = compat_urlparse.urljoin(url, download_path)
+            ext_req = HEADRequest(download_url)
+            ext_handle = self._request_webpage(
+                ext_req, display_id, note='Determining extension')
+            ext = urlhandle_detect_ext(ext_handle)
+            formats.append({
+                'format_id': 'download',
+                'vcodec': 'none',
+                'ext': ext,
+                'url': download_url,
+                'preference': 2,  # Usually better quality
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': track_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'like_count': like_count,
+            'categories': categories,
+        }
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 690c46b6a..f840f6532 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -6,6 +6,7 @@ import json
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
+    compat_HTTPError,
 )
 from ..utils import (
     ExtractorError,
@@ -78,6 +79,16 @@ class NBCNewsIE(InfoExtractor):
             },
             'add_ie': ['ThePlatform'],
         },
+        {
+            'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
+            'md5': 'fdbf39ab73a72df5896b6234ff98518a',
+            'info_dict': {
+                'id': 'Wjf9EDR3A_60',
+                'ext': 'mp4',
+                'title': 'FULL EPISODE: Family Business',
+                'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
+            },
+        },
     ]
 
     def _real_extract(self, url):
@@ -115,10 +126,19 @@ class NBCNewsIE(InfoExtractor):
             if not base_url:
                 continue
             playlist_url = base_url + '?form=MPXNBCNewsAPI'
-            all_videos = self._download_json(playlist_url, title)['videos']
 
             try:
-                info = next(v for v in all_videos if v['mpxId'] == mpxid)
+                all_videos = self._download_json(playlist_url, title)
+            except ExtractorError as ee:
+                if isinstance(ee.cause, compat_HTTPError):
+                    continue
+                raise
+
+            if not all_videos or 'videos' not in all_videos:
+                continue
+
+            try:
+                info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid)
                 break
             except StopIteration:
                 continue
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py
index 95e7d63aa..2a1ca80df 100644
--- a/youtube_dl/extractor/ndtv.py
+++ b/youtube_dl/extractor/ndtv.py
@@ -27,9 +27,7 @@ class NDTVIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
         filename = self._search_regex(
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 634142d0d..fb2032832 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -10,6 +10,7 @@ from ..compat import (
     compat_urllib_request,
 )
 from ..utils import (
+    ExtractorError,
     str_to_int,
 )
 from ..aes import (
@@ -44,6 +45,15 @@ class PornHubIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
+        error_msg = self._html_search_regex(
+            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
+            webpage, 'error message', default=None)
+        if error_msg:
+            error_msg = re.sub(r'\s+', ' ', error_msg)
+            raise ExtractorError(
+                'PornHub said: %s' % error_msg,
+                expected=True, video_id=video_id)
+
         video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
         video_uploader = self._html_search_regex(
             r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|<span class="username)[^>]+>(.+?)<',
diff --git a/youtube_dl/extractor/streetvoice.py b/youtube_dl/extractor/streetvoice.py
new file mode 100644
index 000000000..6a57fa60a
--- /dev/null
+++ b/youtube_dl/extractor/streetvoice.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import unified_strdate
+
+
+class StreetVoiceIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://streetvoice.com/skippylu/songs/94440/',
+        'md5': '15974627fc01a29e492c98593c2fd472',
+        'info_dict': {
+            'id': '94440',
+            'ext': 'mp3',
+            'filesize': 4167053,
+            'title': '輸',
+            'description': 'Crispy脆樂團 - 輸',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 260,
+            'upload_date': '20091018',
+            'uploader': 'Crispy脆樂團',
+            'uploader_id': '627810',
+        }
+    }, {
+        'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+
+        song = self._download_json(
+            'http://streetvoice.com/music/api/song/%s' % song_id, song_id)
+
+        title = song['name']
+        author = song['musician']['name']
+
+        return {
+            'id': song_id,
+            'url': song['file'],
+            'filesize': song.get('size'),
+            'title': title,
+            'description': '%s - %s' % (author, title),
+            'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
+            'duration': song.get('length'),
+            'upload_date': unified_strdate(song.get('created_at')),
+            'uploader': author,
+            'uploader_id': compat_str(song['musician']['id']),
+        }
diff --git a/youtube_dl/extractor/testtube.py b/youtube_dl/extractor/testtube.py
new file mode 100644
index 000000000..fd47e71a2
--- /dev/null
+++ b/youtube_dl/extractor/testtube.py
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class TestTubeIE(InfoExtractor):
+    _VALID_URL = r'https?://testtube\.com/[^/?#]+/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
+        'info_dict': {
+            'id': '60163',
+            'display_id': '5-weird-ways-plants-can-eat-animals',
+            'duration': 275,
+            'ext': 'mp4',
+            'title': '5 Weird Ways Plants Can Eat Animals',
+            'description': 'Why have some plants evolved to eat meat?',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'DNews',
+            'uploader_id': 'dnews',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(
+            r"player\.loadRevision3Item\('video_id',\s*([0-9]+)\);",
+            webpage, 'video ID')
+
+        all_info = self._download_json(
+            'https://testtube.com/api/getPlaylist.json?api_key=ba9c741bce1b9d8e3defcc22193f3651b8867e62&codecs=h264,vp8,theora&video_id=%s' % video_id,
+            video_id)
+        info = all_info['items'][0]
+
+        formats = []
+        for vcodec, fdatas in info['media'].items():
+            for name, fdata in fdatas.items():
+                formats.append({
+                    'format_id': '%s-%s' % (vcodec, name),
+                    'url': fdata['url'],
+                    'vcodec': vcodec,
+                    'tbr': fdata.get('bitrate'),
+                })
+        self._sort_formats(formats)
+
+        duration = int_or_none(info.get('duration'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': info['title'],
+            'description': info.get('summary'),
+            'thumbnail': info.get('images', {}).get('large'),
+            'uploader': info.get('show', {}).get('name'),
+            'uploader_id': info.get('show', {}).get('slug'),
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py
index 4fe89dbe5..e036b8cdf 100644
--- a/youtube_dl/extractor/tinypic.py
+++ b/youtube_dl/extractor/tinypic.py
@@ -9,17 +9,23 @@ from ..utils import ExtractorError
 class TinyPicIE(InfoExtractor):
     IE_NAME = 'tinypic'
     IE_DESC = 'tinypic.com videos'
-    _VALID_URL = r'http://tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
-
-    _TEST = {
-        'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
-        'md5': '609b74432465364e72727ebc6203f044',
-        'info_dict': {
-            'id': '6xw7tc',
-            'ext': 'flv',
-            'title': 'shadow phenomenon weird',
+    _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
+
+    _TESTS = [
+        {
+            'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
+            'md5': '609b74432465364e72727ebc6203f044',
+            'info_dict': {
+                'id': '6xw7tc',
+                'ext': 'flv',
+                'title': 'shadow phenomenon weird',
+            },
+        },
+        {
+            'url': 'http://de.tinypic.com/player.php?v=dy90yh&s=8',
+            'only_matching': True,
         }
-    }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index cc26f417a..f57d609d4 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -12,61 +12,59 @@ class TvpIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
+        'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
         'info_dict': {
             'id': '4278035',
             'ext': 'wmv',
             'title': 'Ogniem i mieczem, odc. 2',
-            'description': 'Bohun dowiaduje się o złamaniu przez kniahinię danego mu słowa i wyrusza do Rozłogów. Helenie w ostatniej chwili udaje się uciec dzięki pomocy Zagłoby.',
         },
     }, {
         'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
+        'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
         'info_dict': {
             'id': '194536',
             'ext': 'mp4',
             'title': 'Czas honoru, I seria – odc. 13',
-            # 'description': 'WŁADEK\nCzesław prosi Marię o dostarczenie Władkowi zarazki tyfusu. Jeśli zachoruje zostanie przewieziony do szpitala skąd łatwiej będzie go odbić. Czy matka zdecyduje się zarazić syna? Karol odwiedza Wandę przyznaje się, że ją oszukiwał, ale ostrzega też, że grozi jej aresztowanie i nalega, żeby wyjechała z Warszawy. Czy dziewczyna zdecyduje się znów oddalić od ukochanego? Rozpoczyna się akcja odbicia Władka.',
         },
     }, {
         'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
+        'md5': 'c3b15ed1af288131115ff17a17c19dda',
         'info_dict': {
             'id': '17916176',
             'ext': 'mp4',
             'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': 'true',
-        },
     }, {
         'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
+        'md5': 'c3b15ed1af288131115ff17a17c19dda',
         'info_dict': {
             'id': '17834272',
             'ext': 'mp4',
             'title': 'Na sygnale, odc. 39',
-            'description': 'Ekipa Wiktora ratuje młodą matkę, która spadła ze schodów trzymając na rękach noworodka. Okazuje się, że dziewczyna jest surogatką, a biologiczni rodzice dziecka próbują zmusić ją do oddania synka…',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': 'true',
         },
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+
         webpage = self._download_webpage(
             'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
 
-        title = self._og_search_title(webpage)
-        series = self._search_regex(
-            r'{name:\s*([\'"])SeriesTitle\1,\s*value:\s*\1(?P<series>.*?)\1},',
+        title = self._search_regex(
+            r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
+            webpage, 'title', group='title')
+        series_title = self._search_regex(
+            r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
            webpage, 'series', group='series', default=None)
-        if series is not None and series not in title:
-            title = '%s, %s' % (series, title)
-        description = self._og_search_description(webpage, default=None)
+        if series_title:
+            title = '%s, %s' % (series_title, title)
+
+        thumbnail = self._search_regex(
+            r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
 
         video_url = self._search_regex(
             r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
-        if video_url is None:
+        if not video_url:
             video_url = self._download_json(
                 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
                 video_id)['video_url']
@@ -89,8 +87,7 @@ class TvpIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'description': description,
+            'thumbnail': thumbnail,
             'formats': formats,
         }
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index b11a1d561..340cadcf5 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
 
 import itertools
 import re
+import random
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_str,
     compat_urllib_parse,
     compat_urllib_request,
 )
@@ -15,44 +17,12 @@ from ..utils import (
 )
 
 
-class TwitchIE(InfoExtractor):
-    # TODO: One broadcast may be split into multiple videos. The key
-    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
-    # starts at 1 and increases. Can we treat all parts as one video?
-    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
-        (?:
-            (?P<channelid>[^/]+)|
-            (?:(?:[^/]+)/v/(?P<vodid>[^/]+))|
-            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
-            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
-        )
-        /?(?:\#.*)?$
-        """
-    _PAGE_LIMIT = 100
+class TwitchBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'
+
     _API_BASE = 'https://api.twitch.tv'
+    _USHER_BASE = 'http://usher.twitch.tv'
     _LOGIN_URL = 'https://secure.twitch.tv/user/login'
-    _TESTS = [{
-        'url': 'http://www.twitch.tv/riotgames/b/577357806',
-        'info_dict': {
-            'id': 'a577357806',
-            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
-        },
-        'playlist_mincount': 12,
-    }, {
-        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
-        'info_dict': {
-            'id': 'c5285812',
-            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
-        },
-        'playlist_mincount': 3,
-    }, {
-        'url': 'http://www.twitch.tv/vanillatv',
-        'info_dict': {
-            'id': 'vanillatv',
-            'title': 'VanillaTV',
-        },
-        'playlist_mincount': 412,
-    }]
 
     def _handle_error(self, response):
         if not isinstance(response, dict):
@@ -64,34 +34,60 @@ class TwitchIE(InfoExtractor):
             expected=True)
 
     def _download_json(self, url, video_id, note='Downloading JSON metadata'):
-        response = super(TwitchIE, self)._download_json(url, video_id, note)
+        response = super(TwitchBaseIE, self)._download_json(url, video_id, note)
         self._handle_error(response)
         return response
 
-    def _extract_media(self, item, item_id):
-        ITEMS = {
-            'a': 'video',
-            'v': 'vod',
-            'c': 'chapter',
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        authenticity_token = self._search_regex(
+            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
+            login_page, 'authenticity token')
+
+        login_form = {
+            'utf8': '✓'.encode('utf-8'),
+            'authenticity_token': authenticity_token,
+            'redirect_on_login': '',
+            'embed_form': 'false',
+            'mp_source_action': '',
+            'follow': '',
+            'user[login]': username,
+            'user[password]': password,
         }
-        info = self._extract_info(self._download_json(
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        request.add_header('Referer', self._LOGIN_URL)
+        response = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        m = re.search(
+            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
+        if m:
+            raise ExtractorError(
+                'Unable to login: %s' % m.group('msg').strip(), expected=True)
+
+
+class TwitchItemBaseIE(TwitchBaseIE):
+    def _download_info(self, item, item_id):
+        return self._extract_info(self._download_json(
             '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
-            'Downloading %s info JSON' % ITEMS[item]))
-
-        if item == 'v':
-            access_token = self._download_json(
-                '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
-                'Downloading %s access token' % ITEMS[item])
-            formats = self._extract_m3u8_formats(
-                'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
-                % (item_id, access_token['token'], access_token['sig']),
-                item_id, 'mp4')
-            info['formats'] = formats
-        return info
+            'Downloading %s info JSON' % self._ITEM_TYPE))
 
+    def _extract_media(self, item_id):
+        info = self._download_info(self._ITEM_SHORTCUT, item_id)
         response = self._download_json(
-            '%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
-            'Downloading %s playlist JSON' % ITEMS[item])
+            '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
+            'Downloading %s playlist JSON' % self._ITEM_TYPE)
         entries = []
         chunks = response['chunks']
         qualities = list(chunks.keys())
@@ -129,119 +125,227 @@ class TwitchIE(InfoExtractor):
             'view_count': info['views'],
         }
 
-    def _real_initialize(self):
-        self._login()
+    def _real_extract(self, url):
+        return self._extract_media(self._match_id(url))
 
-    def _login(self):
-        (username, password) = self._get_login_info()
-        if username is None:
-            return
 
-        login_page = self._download_webpage(
-            self._LOGIN_URL, None, 'Downloading login page')
+class TwitchVideoIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:video'
+    _VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+    _ITEM_TYPE = 'video'
+    _ITEM_SHORTCUT = 'a'
 
-        authenticity_token = self._search_regex(
-            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
-            login_page, 'authenticity token')
+    _TEST = {
+        'url': 'http://www.twitch.tv/riotgames/b/577357806',
+        'info_dict': {
+            'id': 'a577357806',
+            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
+        },
+        'playlist_mincount': 12,
+    }
 
-        login_form = {
-            'utf8': '✓'.encode('utf-8'),
-            'authenticity_token': authenticity_token,
-            'redirect_on_login': '',
-            'embed_form': 'false',
-            'mp_source_action': '',
-            'follow': '',
-            'user[login]': username,
-            'user[password]': password,
-        }
 
-        request = compat_urllib_request.Request(
-            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
-        request.add_header('Referer', self._LOGIN_URL)
-        response = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
+class TwitchChapterIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:chapter'
+    _VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+    _ITEM_TYPE = 'chapter'
+    _ITEM_SHORTCUT = 'c'
 
-        m = re.search(
-            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
-        if m:
-            raise ExtractorError(
-                'Unable to login: %s' % m.group('msg').strip(), expected=True)
+    _TESTS = [{
+        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
+        'info_dict': {
+            'id': 'c5285812',
+            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
+        'only_matching': True,
+    }]
+
+
+class TwitchVodIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:vod'
+    _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+    _ITEM_TYPE = 'vod'
+    _ITEM_SHORTCUT = 'v'
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/ksptv/v/3622000',
+        'info_dict': {
+            'id': 'v3622000',
+            'ext': 'mp4',
+            'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 6951,
+            'timestamp': 1419028564,
+            'upload_date': '20141219',
+            'uploader': 'KSPTV',
+            'uploader_id': 'ksptv',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+        info = self._download_info(self._ITEM_SHORTCUT, item_id)
+        access_token = self._download_json(
+            '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
+            'Downloading %s access token' % self._ITEM_TYPE)
+        formats = self._extract_m3u8_formats(
+            '%s/vod/%s?nauth=%s&nauthsig=%s'
+            % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
+            item_id, 'mp4')
+        info['formats'] = formats
+        return info
+
+
+class TwitchPlaylistBaseIE(TwitchBaseIE):
+    _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
+    _PAGE_LIMIT = 100
+
+    def _extract_playlist(self, channel_id):
+        info = self._download_json(
+            '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
+            channel_id, 'Downloading channel info JSON')
+        channel_name = info.get('display_name') or info.get('name')
+        entries = []
+        offset = 0
+        limit = self._PAGE_LIMIT
+        for counter in itertools.count(1):
+            response = self._download_json(
+                self._PLAYLIST_URL % (channel_id, offset, limit),
+                channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
+            videos = response['videos']
+            if not videos:
+                break
+            entries.extend([self.url_result(video['url']) for video in videos])
+            offset += limit
+        return self.playlist_result(entries, channel_id, channel_name)
+
+    def _real_extract(self, url):
+        return self._extract_playlist(self._match_id(url))
+
+
+class TwitchProfileIE(TwitchPlaylistBaseIE):
+    IE_NAME = 'twitch:profile'
+    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+    _PLAYLIST_TYPE = 'profile'
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/vanillatv/profile',
+        'info_dict': {
+            'id': 'vanillatv',
+            'title': 'VanillaTV',
+        },
+        'playlist_mincount': 412,
+    }
+
+
+class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
+    IE_NAME = 'twitch:past_broadcasts'
+    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+    _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
+    _PLAYLIST_TYPE = 'past broadcasts'
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
+        'info_dict': {
+            'id': 'spamfish',
+            'title': 'Spamfish',
+        },
+        'playlist_mincount': 54,
+    }
+
+
+class TwitchStreamIE(TwitchBaseIE):
+    IE_NAME = 'twitch:stream'
+    _VALID_URL = r'%s/(?P<id>[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/shroomztv',
+        'info_dict': {
+            'id': '12772022048',
+            'display_id': 'shroomztv',
+            'ext': 'mp4',
+            'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV',
+            'is_live': True,
+            'timestamp': 1421928037,
+            'upload_date': '20150122',
+            'uploader': 'ShroomzTV',
+            'uploader_id': 'shroomztv',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj.group('chapterid'):
-            return self._extract_media('c', mobj.group('chapterid'))
+        channel_id = self._match_id(url)
+
+        stream = self._download_json(
+            '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id,
+            'Downloading stream JSON').get('stream')
+
+        # Fallback on profile extraction if stream is offline
+        if not stream:
+            return self.url_result(
+                'http://www.twitch.tv/%s/profile' % channel_id,
+                'TwitchProfile', channel_id)
 
-        """
-        webpage = self._download_webpage(url, chapter_id)
-        m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
+        access_token = self._download_json(
+            '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
+            'Downloading channel access token')
+
+        query = {
+            'allow_source': 'true',
+            'p': random.randint(1000000, 10000000),
+            'player': 'twitchweb',
+            'segment_preference': '4',
+            'sig': access_token['sig'],
+            'token': access_token['token'],
+        }
+
+        formats = self._extract_m3u8_formats(
+            '%s/api/channel/hls/%s.m3u8?%s'
+            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')),
+            channel_id, 'mp4')
+
+        view_count = stream.get('viewers')
+        timestamp = parse_iso8601(stream.get('created_at'))
+
+        channel = stream['channel']
+        title = self._live_title(channel.get('display_name') or channel.get('name'))
+        description = channel.get('status')
+
+        thumbnails = []
+        for thumbnail_key, thumbnail_url in stream['preview'].items():
+            m = re.search(r'(?P<width>\d+)x(?P<height>\d+)\.jpg$', thumbnail_key)
             if not m:
-                raise ExtractorError('Cannot find archive of a chapter')
-            archive_id = m.group(1)
-
-            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
-            doc = self._download_xml(
-                api, chapter_id,
-                note='Downloading chapter information',
-                errnote='Chapter information download failed')
-            for a in doc.findall('.//archive'):
-                if archive_id == a.find('./id').text:
-                    break
-            else:
-                raise ExtractorError('Could not find chapter in chapter information')
-
-            video_url = a.find('./video_file_url').text
-            video_ext = video_url.rpartition('.')[2] or 'flv'
-
-            chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
-            chapter_info = self._download_json(
-                chapter_api_url, 'c' + chapter_id,
-                note='Downloading chapter metadata',
-                errnote='Download of chapter metadata failed')
-
-            bracket_start = int(doc.find('.//bracket_start').text)
-            bracket_end = int(doc.find('.//bracket_end').text)
-
-            # TODO determine start (and probably fix up file)
-            #   youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
-            #video_url += '?start=' + TODO:start_timestamp
-            # bracket_start is 13290, but we want 51670615
-            self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
-                                            'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
-
-            info = {
-                'id': 'c' + chapter_id,
-                'url': video_url,
-                'ext': video_ext,
-                'title': chapter_info['title'],
-                'thumbnail': chapter_info['preview'],
-                'description': chapter_info['description'],
-                'uploader': chapter_info['channel']['display_name'],
-                'uploader_id': chapter_info['channel']['name'],
-            }
-            return info
-        """
-        elif mobj.group('videoid'):
-            return self._extract_media('a', mobj.group('videoid'))
-        elif mobj.group('vodid'):
-            return self._extract_media('v', mobj.group('vodid'))
-        elif mobj.group('channelid'):
-            channel_id = mobj.group('channelid')
-            info = self._download_json(
-                '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
-                channel_id, 'Downloading channel info JSON')
-            channel_name = info.get('display_name') or info.get('name')
-            entries = []
-            offset = 0
-            limit = self._PAGE_LIMIT
-            for counter in itertools.count(1):
-                response = self._download_json(
-                    '%s/kraken/channels/%s/videos/?offset=%d&limit=%d'
-                    % (self._API_BASE, channel_id, offset, limit),
-                    channel_id, 'Downloading channel videos JSON page %d' % counter)
-                videos = response['videos']
-                if not videos:
-                    break
-                entries.extend([self.url_result(video['url'], 'Twitch') for video in videos])
-                offset += limit
-            return self.playlist_result(entries, channel_id, channel_name)
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int(m.group('width')),
+                'height': int(m.group('height')),
+            })
+
+        return {
+            'id': compat_str(stream['_id']),
+            'display_id': channel_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'uploader': channel.get('display_name'),
+            'uploader_id': channel.get('name'),
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'formats': formats,
+            'is_live': True,
+        }
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 9fc64d172..fc6e05fe0 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -1,12 +1,15 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
 )
 from ..utils import (
+    ExtractorError,
     remove_start,
 )
 
@@ -35,8 +38,11 @@ class VideoMegaIE(InfoExtractor):
         req.add_header('Referer', url)
         webpage = self._download_webpage(req, video_id)
 
-        escaped_data = self._search_regex(
-            r'unescape\("([^"]+)"\)', webpage, 'escaped data')
+        try:
+            escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1]
+        except IndexError:
+            raise ExtractorError('Unable to extract escaped data')
+
         playlist = compat_urllib_parse.unquote(escaped_data)
 
         thumbnail = self._search_regex(
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index a30974efd..fd7b400b2 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -264,7 +264,7 @@ def parseOpts(overrideArguments=None):
     authentication.add_option(
         '-p', '--password',
         dest='password', metavar='PASSWORD',
-        help='account password')
+        help='account password. If this option is left out, youtube-dl will ask interactively.')
     authentication.add_option(
         '-2', '--twofactor',
         dest='twofactor', metavar='TWOFACTOR',
@@ -289,6 +289,17 @@ def parseOpts(overrideArguments=None):
             'extensions aac, m4a, mp3, mp4, ogg, wav, webm. '
             'You can also use the special names "best",'
             ' "bestvideo", "bestaudio", "worst".'
+            ' You can filter the video results by putting a condition in'
+            ' brackets, as in -f "best[height=720]"'
+            ' (or -f "[filesize>10M]"). '
+            ' This works for filesize, height, width, tbr, abr, and vbr'
+            ' and the comparisons <, <=, >, >=, =, != .'
+            ' Formats for which the value is not known are excluded unless you'
+            ' put a question mark (?) after the operator.'
+            ' You can combine format filters, so '
+            '-f "[height <=? 720][tbr>500]" '
+            'selects up to 720p videos (or videos where the height is not '
+            'known) with a bitrate of at least 500 KBit/s.'
             ' By default, youtube-dl will pick the best quality.'
             ' Use commas to download multiple audio formats, such as'
             ' -f  136/137/mp4/bestvideo,140/m4a/bestaudio.'
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 7832ed87f..b433b591b 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1612,6 +1612,14 @@ def urlhandle_detect_ext(url_handle):
     except AttributeError:  # Python < 3
         getheader = url_handle.info().getheader
 
+    cd = getheader('Content-Disposition')
+    if cd:
+        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+        if m:
+            e = determine_ext(m.group('filename'), default_ext=None)
+            if e:
+                return e
+
     return getheader('Content-Type').split("/")[1]
 
 
@@ -1623,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
     if content_limit is None:
         return False  # Content available for everyone
     return age_limit < content_limit
+
+
+def is_html(first_bytes):
+    """ Detect whether a file contains HTML by examining its first bytes. """
+
+    BOMS = [
+        (b'\xef\xbb\xbf', 'utf-8'),
+        (b'\x00\x00\xfe\xff', 'utf-32-be'),
+        (b'\xff\xfe\x00\x00', 'utf-32-le'),
+        (b'\xff\xfe', 'utf-16-le'),
+        (b'\xfe\xff', 'utf-16-be'),
+    ]
+    for bom, enc in BOMS:
+        if first_bytes.startswith(bom):
+            s = first_bytes[len(bom):].decode(enc, 'replace')
+            break
+    else:
+        s = first_bytes.decode('utf-8', 'replace')
+
+    return re.match(r'^\s*<', s)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 63a79a7ee..28458fd69 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.01.16'
+__version__ = '2015.01.23.1'
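For reference, a minimal sketch of how the bracket-filter syntax introduced in this commit behaves at the API level. The format dicts below are made up for illustration (real lists come from an extractor), and it assumes this release of youtube_dl is importable; the CLI equivalent would be youtube-dl -f "[height<=?720][tbr>500]" URL, per the new --format help text above.

    from youtube_dl import YoutubeDL

    ydl = YoutubeDL()
    # Hypothetical format list; only the keys read by the filters matter here.
    formats = [
        {'format_id': 'low', 'height': 360, 'tbr': 300},
        {'format_id': 'unknown-height', 'tbr': 900},  # no 'height' key
        {'format_id': 'high', 'height': 1080, 'tbr': 2500},
        {'format_id': 'mid', 'height': 720, 'tbr': 800},
    ]

    # The trailing '[tbr>500]' is stripped and applied first (drops 'low'),
    # then '[height<=?720]' keeps formats up to 720p -- plus 'unknown-height',
    # since '?' lets formats with a missing value pass. The emptied spec falls
    # back to 'best', i.e. the last surviving format.
    best = ydl.select_format('[height<=?720][tbr>500]', formats)
    print(best['format_id'])  # mid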