diff options
| -rw-r--r-- | test/test_youtube_signature.py | 6 | ||||
| -rw-r--r-- | youtube_dl/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/firedrive.py | 83 | ||||
| -rw-r--r-- | youtube_dl/extractor/mtv.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/ndr.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/pyvideo.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/reverbnation.py | 45 | ||||
| -rw-r--r-- | youtube_dl/extractor/ruhd.py | 46 | ||||
| -rw-r--r-- | youtube_dl/extractor/soundcloud.py | 16 | ||||
| -rw-r--r-- | youtube_dl/extractor/southpark.py (renamed from youtube_dl/extractor/southparkstudios.py) | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/tlc.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/tutv.py | 21 | ||||
| -rw-r--r-- | youtube_dl/jsinterp.py | 40 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 2 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
16 files changed, 264 insertions, 39 deletions
| diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 8d46fe108..d95533959 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -34,6 +34,12 @@ _TESTS = [          u']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876',      ),      ( +        u'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', +        u'js', +        84, +        u'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', +    ), +    (          u'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js',          u'js',          u'2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 89a2cb3e8..5e16a5491 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -63,6 +63,7 @@ __authors__  = (      'Ariset Llerena',      'Adam Malcontenti-Wilson',      'Tobias Bell', +    'Naglis Jonaitis',  )  __license__ = 'Public Domain' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f75939a05..14133c315 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .extremetube import ExtremeTubeIE  from .facebook import FacebookIE  from .faz import FazIE  from .fc2 import FC2IE +from .firedrive import FiredriveIE  from .firstpost import FirstpostIE  from .firsttv import FirstTVIE  from .fivemin import FiveMinIE @@ -232,6 +233,7 @@ from .radiofrance import RadioFranceIE  from .rai import RaiIE  from .rbmaradio import RBMARadioIE  from .redtube import RedTubeIE +from .reverbnation import ReverbNationIE  from .ringtv import RingTVIE  from .ro220 import Ro220IE  from .rottentomatoes import RottenTomatoesIE @@ -240,6 +242,7 @@ from .rtbf import RTBFIE  from .rtlnow import RTLnowIE  from .rts import RTSIE  from .rtve import RTVEALaCartaIE +from .ruhd import RUHDIE  from .rutube import (      RutubeIE,      RutubeChannelIE, @@ -268,8 +271,8 @@ from .soundcloud import (      SoundcloudPlaylistIE  )  from .soundgasm import SoundgasmIE -from .southparkstudios import ( -    SouthParkStudiosIE, +from .southpark import ( +    SouthParkIE,      SouthparkDeIE,  )  from .space import SpaceIE diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py new file mode 100644 index 000000000..d26145db1 --- /dev/null +++ b/youtube_dl/extractor/firedrive.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    compat_urllib_parse, +    compat_urllib_request, +    determine_ext, +) + + +class FiredriveIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ +                 '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)' +    _FILE_DELETED_REGEX = r'<div class="removed_file_image">' + +    _TESTS = [{ +        'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', +        'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', +        'info_dict': { +            'id': 'FEB892FA160EBD01', +            'ext': 'flv', +            'title': 'bbb_theora_486kbit.flv', +            'thumbnail': 're:^http://.*\.jpg$', +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        url = 'http://firedrive.com/file/%s' % video_id + +        webpage = self._download_webpage(url, video_id) + +        if re.search(self._FILE_DELETED_REGEX, webpage) is not None: +            raise ExtractorError('Video %s does not exist' % video_id, +                                 expected=True) + +        fields = dict(re.findall(r'''(?x)<input\s+ +            type="hidden"\s+ +            name="([^"]+)"\s+ +            (?:id="[^"]+"\s+)? +            value="([^"]*)" +            ''', webpage)) + +        post = compat_urllib_parse.urlencode(fields) +        req = compat_urllib_request.Request(url, post) +        req.add_header('Content-type', 'application/x-www-form-urlencoded') + +        # Apparently, this header is required for confirmation to work. +        req.add_header('Host', 'www.firedrive.com') + +        webpage = self._download_webpage(req, video_id, +                                         'Downloading video page') + +        title = self._search_regex(r'class="external_title_left">(.+)</div>', +                                   webpage, 'title') +        thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, +                                       'thumbnail', fatal=False) +        if thumbnail is not None: +            thumbnail = 'http:' + thumbnail + +        ext = self._search_regex(r'type:\s?\'([^\']+)\',', +                                 webpage, 'extension', fatal=False) +        video_url = self._search_regex( +            r'file:\s?\'(http[^\']+)\',', webpage, 'file url') + +        formats = [{ +            'format_id': 'sd', +            'url': video_url, +            'ext': ext, +        }] + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': thumbnail, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index af9490ccc..228b42d2b 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -158,6 +158,9 @@ class MTVServicesInfoExtractor(InfoExtractor):              if mgid.endswith('.swf'):                  mgid = mgid[:-4]          except RegexNotFoundError: +            mgid = None + +        if mgid is None or ':' not in mgid:              mgid = self._search_regex(                  [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],                  webpage, u'mgid') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 3d6096e46..94d5ba982 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -18,15 +18,15 @@ class NDRIE(InfoExtractor):      _TESTS = [          { -            'url': 'http://www.ndr.de/fernsehen/sendungen/markt/markt7959.html', -            'md5': 'e7a6079ca39d3568f4996cb858dd6708', +            'url': 'http://www.ndr.de/fernsehen/media/dienordreportage325.html', +            'md5': '4a4eeafd17c3058b65f0c8f091355855',              'note': 'Video file',              'info_dict': { -                'id': '7959', +                'id': '325',                  'ext': 'mp4', -                'title': 'Markt - die ganze Sendung', -                'description': 'md5:af9179cf07f67c5c12dc6d9997e05725', -                'duration': 2655, +                'title': 'Blaue Bohnen aus Blocken', +                'description': 'md5:190d71ba2ccddc805ed01547718963bc', +                'duration': 1715,              },          },          { diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 0bc0859b4..6d5732d45 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -46,7 +46,7 @@ class PyvideoIE(InfoExtractor):              return self.url_result(m_youtube.group(1), 'Youtube')          title = self._html_search_regex( -            r'<div class="section">.*?<h3(?:\s+class="[^"]*")?>([^>]+?)</h3>', +            r'<div class="section">\s*<h3(?:\s+class="[^"]*"[^>]*)?>([^>]+?)</h3>',              webpage, 'title', flags=re.DOTALL)          video_url = self._search_regex(              [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'], diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py new file mode 100644 index 000000000..49cf427a1 --- /dev/null +++ b/youtube_dl/extractor/reverbnation.py @@ -0,0 +1,45 @@ +from __future__ import unicode_literals + +import re +import time + +from .common import InfoExtractor +from ..utils import strip_jsonp + + +class ReverbNationIE(InfoExtractor): +    _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' +    _TESTS = [{ +        'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', +        'file': '16965047.mp3', +        'md5': '3da12ebca28c67c111a7f8b262d3f7a7', +        'info_dict': { +            "title": "MONA LISA", +            "uploader": "ALKILADOS", +            "uploader_id": 216429, +            "thumbnail": "//gp1.wac.edgecastcdn.net/802892/production_public/Photo/13761700/image/1366002176_AVATAR_MONA_LISA.jpg" +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        song_id = mobj.group('id') + +        api_res = self._download_json( +            'https://api.reverbnation.com/song/%s?callback=api_response_5&_=%d' +                % (song_id, int(time.time() * 1000)), +            song_id, +            transform_source=strip_jsonp, +            note='Downloading information of song %s' % song_id +        ) + +        return { +            'id': song_id, +            'title': api_res.get('name'), +            'url': api_res.get('url'), +            'uploader': api_res.get('artist', {}).get('name'), +            'uploader_id': api_res.get('artist', {}).get('id'), +            'thumbnail': api_res.get('image', api_res.get('thumbnail')), +            'ext': 'mp3', +            'vcodec': 'none', +        } diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py new file mode 100644 index 000000000..55b58e5e6 --- /dev/null +++ b/youtube_dl/extractor/ruhd.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RUHDIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' +    _TEST = { +        'url': 'http://www.ruhd.ru/play.php?vid=207', +        'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', +        'info_dict': { +            'id': '207', +            'ext': 'divx', +            'title': 'КОТ бааааам', +            'description': 'классный кот)', +            'thumbnail': 're:^http://.*\.jpg$', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        video_url = self._html_search_regex( +            r'<param name="src" value="([^"]+)"', webpage, 'video url') +        title = self._html_search_regex( +            r'<title>([^<]+)   RUHD.ru - Видео Высокого качества №1 в России!</title>', webpage, 'title') +        description = self._html_search_regex( +            r'(?s)<div id="longdesc">(.+?)<span id="showlink">', webpage, 'description', fatal=False) +        thumbnail = self._html_search_regex( +            r'<param name="previewImage" value="([^"]+)"', webpage, 'thumbnail', fatal=False) +        if thumbnail: +            thumbnail = 'http://www.ruhd.ru' + thumbnail + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 14ec9452d..8a77c1370 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -81,16 +81,16 @@ class SoundcloudIE(InfoExtractor):          },          # downloadable song          { -            'url': 'https://soundcloud.com/simgretina/just-your-problem-baby-1', -            'md5': '56a8b69568acaa967b4c49f9d1d52d19', +            'url': 'https://soundcloud.com/oddsamples/bus-brakes', +            'md5': 'fee7b8747b09bb755cefd4b853e7249a',              'info_dict': { -                'id': '105614606', +                'id': '128590877',                  'ext': 'wav', -                'title': 'Just Your Problem Baby (Acapella)', -                'description': 'Vocals', -                'uploader': 'Sim Gretina', -                'upload_date': '20130815', -                #'duration': 42, +                'title': 'Bus Brakes', +                'description': 'md5:0170be75dd395c96025d210d261c784e', +                'uploader': 'oddsamples', +                'upload_date': '20140109', +                'duration': 17,              },          },      ] diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southpark.py index aea8e6439..c20397b3d 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southpark.py @@ -3,24 +3,24 @@ from __future__ import unicode_literals  from .mtv import MTVServicesInfoExtractor -class SouthParkStudiosIE(MTVServicesInfoExtractor): -    IE_NAME = 'southparkstudios.com' -    _VALID_URL = r'https?://(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))' +class SouthParkIE(MTVServicesInfoExtractor): +    IE_NAME = 'southpark.cc.com' +    _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'      _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'      _TESTS = [{ -        'url': 'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', +        'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured',          'info_dict': {              'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30',              'ext': 'mp4', -            'title': 'Bat Daded', +            'title': 'South Park|Bat Daded',              'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.',          },      }] -class SouthparkDeIE(SouthParkStudiosIE): +class SouthparkDeIE(SouthParkIE):      IE_NAME = 'southpark.de'      _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'      _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index ad175b83e..d848ee186 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from .brightcove import BrightcoveIE  from .discovery import DiscoveryIE +from ..utils import compat_urlparse  class TlcIE(DiscoveryIE): @@ -51,6 +52,10 @@ class TlcDeIE(InfoExtractor):          # Otherwise we don't get the correct 'BrightcoveExperience' element,          # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/          iframe_url = iframe_url.replace('.htm?', '.php?') +        url_fragment = compat_urlparse.urlparse(url).fragment +        if url_fragment: +            # Since the fragment is not send to the server, we always get the same iframe +            iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url)          iframe = self._download_webpage(iframe_url, title)          return { diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index c980153ec..d516b6427 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -1,21 +1,21 @@  from __future__ import unicode_literals +  import base64  import re  from .common import InfoExtractor -from ..utils import ( -    compat_parse_qs, -) +from ..utils import compat_parse_qs  class TutvIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)'      _TEST = { -        'url': 'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', -        'file': '2742556.flv', -        'md5': '5eb766671f69b82e528dc1e7769c5cb2', +        'url': 'http://tu.tv/videos/robots-futbolistas', +        'md5': '627c7c124ac2a9b5ab6addb94e0e65f7',          'info_dict': { -            'title': 'Noah en pabellon cuahutemoc', +            'id': '2973058', +            'ext': 'flv', +            'title': 'Robots futbolistas',          },      } @@ -26,10 +26,9 @@ class TutvIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID') -        data_url = 'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) -        data_content = self._download_webpage(data_url, video_id, note='Downloading video info') -        data = compat_parse_qs(data_content) -        video_url = base64.b64decode(data['kpt'][0]).decode('utf-8') +        data_content = self._download_webpage( +            'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') +        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')          return {              'id': internal_id, diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 3bbb07704..ae5bca2e6 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -11,6 +11,7 @@ class JSInterpreter(object):      def __init__(self, code):          self.code = code          self._functions = {} +        self._objects = {}      def interpret_statement(self, stmt, local_vars, allow_recursion=20):          if allow_recursion < 0: @@ -55,7 +56,19 @@ class JSInterpreter(object):          m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)          if m:              member = m.group('member') -            val = local_vars[m.group('in')] +            variable = m.group('in') + +            if variable not in local_vars: +                if variable not in self._objects: +                    self._objects[variable] = self.extract_object(variable) +                obj = self._objects[variable] +                key, args = member.split('(', 1) +                args = args.strip(')') +                argvals = [int(v) if v.isdigit() else local_vars[v] +                           for v in args.split(',')] +                return obj[key](argvals) + +            val = local_vars[variable]              if member == 'split("")':                  return list(val)              if member == 'join("")': @@ -97,6 +110,25 @@ class JSInterpreter(object):              return self._functions[fname](argvals)          raise ExtractorError('Unsupported JS expression %r' % expr) +    def extract_object(self, objname): +        obj = {} +        obj_m = re.search( +            (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) + +            r'\s*(?P<fields>([a-zA-Z$]+\s*:\s*function\(.*?\)\s*\{.*?\})*)' + +            r'\}\s*;', +            self.code) +        fields = obj_m.group('fields') +        # Currently, it only supports function definitions +        fields_m = re.finditer( +            r'(?P<key>[a-zA-Z$]+)\s*:\s*function' +            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', +            fields) +        for f in fields_m: +            argnames = f.group('args').split(',') +            obj[f.group('key')] = self.build_function(argnames, f.group('code')) + +        return obj +      def extract_function(self, funcname):          func_m = re.search(              (r'(?:function %s|[{;]%s\s*=\s*function)' % ( @@ -107,10 +139,12 @@ class JSInterpreter(object):              raise ExtractorError('Could not find JS function %r' % funcname)          argnames = func_m.group('args').split(',') +        return self.build_function(argnames, func_m.group('code')) + +    def build_function(self, argnames, code):          def resf(args):              local_vars = dict(zip(argnames, args)) -            for stmt in func_m.group('code').split(';'): +            for stmt in code.split(';'):                  res = self.interpret_statement(stmt, local_vars)              return res          return resf - diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2cba2bfc1..64a9618ca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1428,7 +1428,7 @@ US_RATINGS = {  def strip_jsonp(code): -    return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code) +    return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)  def qualities(quality_ids): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2c9591630..4d606c3d2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.11.3' +__version__ = '2014.07.15' | 
