diff options
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/__init__.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/__init__.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/firedrive.py | 83 | ||||
-rw-r--r-- | youtube_dl/extractor/mtv.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/ndr.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/pyvideo.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/reverbnation.py | 45 | ||||
-rw-r--r-- | youtube_dl/extractor/ruhd.py | 46 | ||||
-rw-r--r-- | youtube_dl/extractor/soundcloud.py | 16 | ||||
-rw-r--r-- | youtube_dl/extractor/southpark.py (renamed from youtube_dl/extractor/southparkstudios.py) | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/tlc.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/tutv.py | 21 | ||||
-rw-r--r-- | youtube_dl/jsinterp.py | 40 | ||||
-rw-r--r-- | youtube_dl/utils.py | 2 | ||||
-rw-r--r-- | youtube_dl/version.py | 2 |
15 files changed, 258 insertions, 39 deletions
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 89a2cb3e8..5e16a5491 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -63,6 +63,7 @@ __authors__ = ( 'Ariset Llerena', 'Adam Malcontenti-Wilson', 'Tobias Bell', + 'Naglis Jonaitis', ) __license__ = 'Public Domain' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f75939a05..14133c315 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE +from .firedrive import FiredriveIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE @@ -232,6 +233,7 @@ from .radiofrance import RadioFranceIE from .rai import RaiIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE +from .reverbnation import ReverbNationIE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE @@ -240,6 +242,7 @@ from .rtbf import RTBFIE from .rtlnow import RTLnowIE from .rts import RTSIE from .rtve import RTVEALaCartaIE +from .ruhd import RUHDIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -268,8 +271,8 @@ from .soundcloud import ( SoundcloudPlaylistIE ) from .soundgasm import SoundgasmIE -from .southparkstudios import ( - SouthParkStudiosIE, +from .southpark import ( + SouthParkIE, SouthparkDeIE, ) from .space import SpaceIE diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py new file mode 100644 index 000000000..d26145db1 --- /dev/null +++ b/youtube_dl/extractor/firedrive.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, + determine_ext, +) + + +class FiredriveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ + '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)' + _FILE_DELETED_REGEX = r'<div class="removed_file_image">' + + _TESTS = [{ + 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', + 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', + 'info_dict': { + 'id': 'FEB892FA160EBD01', + 'ext': 'flv', + 'title': 'bbb_theora_486kbit.flv', + 'thumbnail': 're:^http://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://firedrive.com/file/%s' % video_id + + webpage = self._download_webpage(url, video_id) + + if re.search(self._FILE_DELETED_REGEX, webpage) is not None: + raise ExtractorError('Video %s does not exist' % video_id, + expected=True) + + fields = dict(re.findall(r'''(?x)<input\s+ + type="hidden"\s+ + name="([^"]+)"\s+ + (?:id="[^"]+"\s+)? + value="([^"]*)" + ''', webpage)) + + post = compat_urllib_parse.urlencode(fields) + req = compat_urllib_request.Request(url, post) + req.add_header('Content-type', 'application/x-www-form-urlencoded') + + # Apparently, this header is required for confirmation to work. + req.add_header('Host', 'www.firedrive.com') + + webpage = self._download_webpage(req, video_id, + 'Downloading video page') + + title = self._search_regex(r'class="external_title_left">(.+)</div>', + webpage, 'title') + thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, + 'thumbnail', fatal=False) + if thumbnail is not None: + thumbnail = 'http:' + thumbnail + + ext = self._search_regex(r'type:\s?\'([^\']+)\',', + webpage, 'extension', fatal=False) + video_url = self._search_regex( + r'file:\s?\'(http[^\']+)\',', webpage, 'file url') + + formats = [{ + 'format_id': 'sd', + 'url': video_url, + 'ext': ext, + }] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index af9490ccc..228b42d2b 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -158,6 +158,9 @@ class MTVServicesInfoExtractor(InfoExtractor): if mgid.endswith('.swf'): mgid = mgid[:-4] except RegexNotFoundError: + mgid = None + + if mgid is None or ':' not in mgid: mgid = self._search_regex( [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'], webpage, u'mgid') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 3d6096e46..94d5ba982 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -18,15 +18,15 @@ class NDRIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.ndr.de/fernsehen/sendungen/markt/markt7959.html', - 'md5': 'e7a6079ca39d3568f4996cb858dd6708', + 'url': 'http://www.ndr.de/fernsehen/media/dienordreportage325.html', + 'md5': '4a4eeafd17c3058b65f0c8f091355855', 'note': 'Video file', 'info_dict': { - 'id': '7959', + 'id': '325', 'ext': 'mp4', - 'title': 'Markt - die ganze Sendung', - 'description': 'md5:af9179cf07f67c5c12dc6d9997e05725', - 'duration': 2655, + 'title': 'Blaue Bohnen aus Blocken', + 'description': 'md5:190d71ba2ccddc805ed01547718963bc', + 'duration': 1715, }, }, { diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 0bc0859b4..6d5732d45 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -46,7 +46,7 @@ class PyvideoIE(InfoExtractor): return self.url_result(m_youtube.group(1), 'Youtube') title = self._html_search_regex( - r'<div class="section">.*?<h3(?:\s+class="[^"]*")?>([^>]+?)</h3>', + r'<div class="section">\s*<h3(?:\s+class="[^"]*"[^>]*)?>([^>]+?)</h3>', webpage, 'title', flags=re.DOTALL) video_url = self._search_regex( [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'], diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py new file mode 100644 index 000000000..49cf427a1 --- /dev/null +++ b/youtube_dl/extractor/reverbnation.py @@ -0,0 +1,45 @@ +from __future__ import unicode_literals + +import re +import time + +from .common import InfoExtractor +from ..utils import strip_jsonp + + +class ReverbNationIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' + _TESTS = [{ + 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', + 'file': '16965047.mp3', + 'md5': '3da12ebca28c67c111a7f8b262d3f7a7', + 'info_dict': { + "title": "MONA LISA", + "uploader": "ALKILADOS", + "uploader_id": 216429, + "thumbnail": "//gp1.wac.edgecastcdn.net/802892/production_public/Photo/13761700/image/1366002176_AVATAR_MONA_LISA.jpg" + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + song_id = mobj.group('id') + + api_res = self._download_json( + 'https://api.reverbnation.com/song/%s?callback=api_response_5&_=%d' + % (song_id, int(time.time() * 1000)), + song_id, + transform_source=strip_jsonp, + note='Downloading information of song %s' % song_id + ) + + return { + 'id': song_id, + 'title': api_res.get('name'), + 'url': api_res.get('url'), + 'uploader': api_res.get('artist', {}).get('name'), + 'uploader_id': api_res.get('artist', {}).get('id'), + 'thumbnail': api_res.get('image', api_res.get('thumbnail')), + 'ext': 'mp3', + 'vcodec': 'none', + } diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py new file mode 100644 index 000000000..55b58e5e6 --- /dev/null +++ b/youtube_dl/extractor/ruhd.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RUHDIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' + _TEST = { + 'url': 'http://www.ruhd.ru/play.php?vid=207', + 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', + 'info_dict': { + 'id': '207', + 'ext': 'divx', + 'title': 'КОТ бааааам', + 'description': 'классный кот)', + 'thumbnail': 're:^http://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'<param name="src" value="([^"]+)"', webpage, 'video url') + title = self._html_search_regex( + r'<title>([^<]+) RUHD.ru - Видео Высокого качества №1 в России!</title>', webpage, 'title') + description = self._html_search_regex( + r'(?s)<div id="longdesc">(.+?)<span id="showlink">', webpage, 'description', fatal=False) + thumbnail = self._html_search_regex( + r'<param name="previewImage" value="([^"]+)"', webpage, 'thumbnail', fatal=False) + if thumbnail: + thumbnail = 'http://www.ruhd.ru' + thumbnail + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 14ec9452d..8a77c1370 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -81,16 +81,16 @@ class SoundcloudIE(InfoExtractor): }, # downloadable song { - 'url': 'https://soundcloud.com/simgretina/just-your-problem-baby-1', - 'md5': '56a8b69568acaa967b4c49f9d1d52d19', + 'url': 'https://soundcloud.com/oddsamples/bus-brakes', + 'md5': 'fee7b8747b09bb755cefd4b853e7249a', 'info_dict': { - 'id': '105614606', + 'id': '128590877', 'ext': 'wav', - 'title': 'Just Your Problem Baby (Acapella)', - 'description': 'Vocals', - 'uploader': 'Sim Gretina', - 'upload_date': '20130815', - #'duration': 42, + 'title': 'Bus Brakes', + 'description': 'md5:0170be75dd395c96025d210d261c784e', + 'uploader': 'oddsamples', + 'upload_date': '20140109', + 'duration': 17, }, }, ] diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southpark.py index aea8e6439..c20397b3d 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southpark.py @@ -3,24 +3,24 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -class SouthParkStudiosIE(MTVServicesInfoExtractor): - IE_NAME = 'southparkstudios.com' - _VALID_URL = r'https?://(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))' +class SouthParkIE(MTVServicesInfoExtractor): + IE_NAME = 'southpark.cc.com' + _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' _TESTS = [{ - 'url': 'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', + 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', 'info_dict': { 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', 'ext': 'mp4', - 'title': 'Bat Daded', + 'title': 'South Park|Bat Daded', 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', }, }] -class SouthparkDeIE(SouthParkStudiosIE): +class SouthparkDeIE(SouthParkIE): IE_NAME = 'southpark.de' _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index ad175b83e..d848ee186 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from .brightcove import BrightcoveIE from .discovery import DiscoveryIE +from ..utils import compat_urlparse class TlcIE(DiscoveryIE): @@ -51,6 +52,10 @@ class TlcDeIE(InfoExtractor): # Otherwise we don't get the correct 'BrightcoveExperience' element, # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/ iframe_url = iframe_url.replace('.htm?', '.php?') + url_fragment = compat_urlparse.urlparse(url).fragment + if url_fragment: + # Since the fragment is not send to the server, we always get the same iframe + iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url) iframe = self._download_webpage(iframe_url, title) return { diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index c980153ec..d516b6427 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -1,21 +1,21 @@ from __future__ import unicode_literals + import base64 import re from .common import InfoExtractor -from ..utils import ( - compat_parse_qs, -) +from ..utils import compat_parse_qs class TutvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)' _TEST = { - 'url': 'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', - 'file': '2742556.flv', - 'md5': '5eb766671f69b82e528dc1e7769c5cb2', + 'url': 'http://tu.tv/videos/robots-futbolistas', + 'md5': '627c7c124ac2a9b5ab6addb94e0e65f7', 'info_dict': { - 'title': 'Noah en pabellon cuahutemoc', + 'id': '2973058', + 'ext': 'flv', + 'title': 'Robots futbolistas', }, } @@ -26,10 +26,9 @@ class TutvIE(InfoExtractor): webpage = self._download_webpage(url, video_id) internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID') - data_url = 'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) - data_content = self._download_webpage(data_url, video_id, note='Downloading video info') - data = compat_parse_qs(data_content) - video_url = base64.b64decode(data['kpt'][0]).decode('utf-8') + data_content = self._download_webpage( + 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') + video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') return { 'id': internal_id, diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 3bbb07704..ae5bca2e6 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -11,6 +11,7 @@ class JSInterpreter(object): def __init__(self, code): self.code = code self._functions = {} + self._objects = {} def interpret_statement(self, stmt, local_vars, allow_recursion=20): if allow_recursion < 0: @@ -55,7 +56,19 @@ class JSInterpreter(object): m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr) if m: member = m.group('member') - val = local_vars[m.group('in')] + variable = m.group('in') + + if variable not in local_vars: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + key, args = member.split('(', 1) + args = args.strip(')') + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in args.split(',')] + return obj[key](argvals) + + val = local_vars[variable] if member == 'split("")': return list(val) if member == 'join("")': @@ -97,6 +110,25 @@ class JSInterpreter(object): return self._functions[fname](argvals) raise ExtractorError('Unsupported JS expression %r' % expr) + def extract_object(self, objname): + obj = {} + obj_m = re.search( + (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) + + r'\s*(?P<fields>([a-zA-Z$]+\s*:\s*function\(.*?\)\s*\{.*?\})*)' + + r'\}\s*;', + self.code) + fields = obj_m.group('fields') + # Currently, it only supports function definitions + fields_m = re.finditer( + r'(?P<key>[a-zA-Z$]+)\s*:\s*function' + r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', + fields) + for f in fields_m: + argnames = f.group('args').split(',') + obj[f.group('key')] = self.build_function(argnames, f.group('code')) + + return obj + def extract_function(self, funcname): func_m = re.search( (r'(?:function %s|[{;]%s\s*=\s*function)' % ( @@ -107,10 +139,12 @@ class JSInterpreter(object): raise ExtractorError('Could not find JS function %r' % funcname) argnames = func_m.group('args').split(',') + return self.build_function(argnames, func_m.group('code')) + + def build_function(self, argnames, code): def resf(args): local_vars = dict(zip(argnames, args)) - for stmt in func_m.group('code').split(';'): + for stmt in code.split(';'): res = self.interpret_statement(stmt, local_vars) return res return resf - diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2cba2bfc1..64a9618ca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1428,7 +1428,7 @@ US_RATINGS = { def strip_jsonp(code): - return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code) + return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) def qualities(quality_ids): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2c9591630..4d606c3d2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.11.3' +__version__ = '2014.07.15' |