diff options
| -rw-r--r-- | test/test_signatures.py | 75 | ||||
| -rw-r--r-- | youtube_dl/extractor/pbs.py | 73 | 
2 files changed, 129 insertions, 19 deletions
diff --git a/test/test_signatures.py b/test/test_signatures.py
new file mode 100644
index 000000000..a3fc53047
--- /dev/null
+++ b/test/test_signatures.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+import io
+import re
+import string
+
+from youtube_dl.extractor import YoutubeIE
+from youtube_dl.utils import compat_str, compat_urlretrieve
+
+_TESTS = [
+    (
+        u'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js',
+        u'js',
+        86,
+        u'>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321',
+    ),
+    (
+        u'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js',
+        u'js',
+        85,
+        u'3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@',
+    ),
+]
+
+
+class TestSignature(unittest.TestCase):
+    def setUp(self):
+        TEST_DIR = os.path.dirname(os.path.abspath(__file__))
+        self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata')
+        if not os.path.exists(self.TESTDATA_DIR):
+            os.mkdir(self.TESTDATA_DIR)
+
+
+def make_tfunc(url, stype, sig_length, expected_sig):
+    basename = url.rpartition('/')[2]
+    m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename)
+    assert m, '%r should follow URL format' % basename
+    test_id = m.group(1)
+
+    def test_func(self):
+        fn = os.path.join(self.TESTDATA_DIR, basename)
+
+        if not os.path.exists(fn):
+            compat_urlretrieve(url, fn)
+
+        ie = YoutubeIE()
+        if stype == 'js':
+            with io.open(fn, encoding='utf-8') as testf:
+                jscode = testf.read()
+            func = ie._parse_sig_js(jscode)
+        else:
+            assert stype == 'swf'
+            with open(fn, 'rb') as testf:
+                swfcode = testf.read()
+            func = ie._parse_sig_swf(swfcode)
+        src_sig = compat_str(string.printable[:sig_length])
+        got_sig = func(src_sig)
+        self.assertEqual(got_sig, expected_sig)
+
+    test_func.__name__ = str('test_signature_' + stype + '_' + test_id)
+    setattr(TestSignature, test_func.__name__, test_func)
+
+for test_spec in _TESTS:
+    make_tfunc(*test_spec)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 25f019231..7444b7b5b 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 import json
 
@@ -5,30 +7,63 @@ from .common import InfoExtractor
 
 
 class PBSIE(InfoExtractor):
-    _VALID_URL = r'https?://video\.pbs\.org/video/(?P<id>\d+)/?'
+    _VALID_URL = r'''(?x)https?://
+        (?:
+            # Direct video URL
+            video\.pbs\.org/video/(?P<id>[0-9]+)/? |
+            # Article with embedded player
+           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
+           # Player
+           video\.pbs\.org/partnerplayer/(?P<player_id>[^/]+)/
+        )
+    '''
 
     _TEST = {
-        u'url': u'http://video.pbs.org/video/2365006249/',
-        u'file': u'2365006249.mp4',
-        u'md5': 'ce1888486f0908d555a8093cac9a7362',
-        u'info_dict': {
-            u'title': u'A More Perfect Union',
-            u'description': u'md5:ba0c207295339c8d6eced00b7c363c6a',
-            u'duration': 3190,
+        'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
+        'md5': 'ce1888486f0908d555a8093cac9a7362',
+        'info_dict': {
+            'id': '2365006249',
+            'ext': 'mp4',
+            'title': 'A More Perfect Union',
+            'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
+            'duration': 3190,
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+
+        presumptive_id = mobj.group('presumptive_id')
+        display_id = presumptive_id
+        if presumptive_id:
+            webpage = self._download_webpage(url, display_id)
+            url = self._search_regex(
+                r'<iframe\s+id=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
+                webpage, 'player URL')
+            mobj = re.match(self._VALID_URL, url)
+
+        player_id = mobj.group('player_id')
+        if not display_id:
+            display_id = player_id
+        if player_id:
+            player_page = self._download_webpage(
+                url, display_id, note='Downloading player page',
+                errnote='Could not download player page')
+            video_id = self._search_regex(
+                r'<div\s+id="video_([0-9]+)"', player_page, 'video ID')
+        else:
+            video_id = mobj.group('id')
+            display_id = video_id
+
         info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
-        info_page = self._download_webpage(info_url, video_id)
-        info =json.loads(info_page)
-        return {'id': video_id,
-                'title': info['title'],
-                'url': info['alternate_encoding']['url'],
-                'ext': 'mp4',
-                'description': info['program'].get('description'),
-                'thumbnail': info.get('image_url'),
-                'duration': info.get('duration'),
-                }
+        info = self._download_json(info_url, display_id)
+
+        return {
+            'id': video_id,
+            'title': info['title'],
+            'url': info['alternate_encoding']['url'],
+            'ext': 'mp4',
+            'description': info['program'].get('description'),
+            'thumbnail': info.get('image_url'),
+            'duration': info.get('duration'),
+        }
