diff options
| author | Yen Chi Hsuan <yan12125@gmail.com> | 2017-05-01 23:09:18 +0800 | 
|---|---|---|
| committer | Yen Chi Hsuan <yan12125@gmail.com> | 2017-05-04 16:26:17 +0800 | 
| commit | 0c265486016b06342fb257966474ce591667aaff (patch) | |
| tree | 535a5d3331dd5be08e818174e630389485ad30ac | |
| parent | 5401bea27fd6bcdd030e50f3af85145807eeab27 (diff) | |
[cda] Implement birthday verification (closes #12789)
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | ChangeLog | 1 |
| -rw-r--r-- | test/test_utils.py | 11 |
| -rwxr-xr-x | youtube_dl/extractor/cda.py | 52 |
| -rw-r--r-- | youtube_dl/extractor/videopress.py | 9 |
| -rw-r--r-- | youtube_dl/utils.py | 61 |
5 files changed, 125 insertions, 9 deletions
| @@ -1,6 +1,7 @@  version <unreleased>  Extractors ++ [cda] Support birthday verification (#12789)  * [leeco] Fix extraction (#12974) diff --git a/test/test_utils.py b/test/test_utils.py index 05fdc0e95..f31559e71 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -44,6 +44,7 @@ from youtube_dl.utils import (      limit_length,      mimetype2ext,      month_by_name, +    multipart_encode,      ohdave_rsa_encrypt,      OnDemandPagedList,      orderedSet, @@ -620,6 +621,16 @@ class TestUtil(unittest.TestCase):              'http://example.com/path', {'test': '第二行тест'})),              query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) +    def test_multipart_encode(self): +        self.assertEqual( +            multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0], +            b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n') +        self.assertEqual( +            multipart_encode({'欄位'.encode('utf-8'): '值'.encode('utf-8')}, boundary='AAAAAA')[0], +            b'--AAAAAA\r\nContent-Disposition: form-data; name="\xe6\xac\x84\xe4\xbd\x8d"\r\n\r\n\xe5\x80\xbc\r\n--AAAAAA--\r\n') +        self.assertRaises( +            ValueError, multipart_encode, {b'field': b'value'}, boundary='value') +      def test_dict_get(self):          FALSE_VALUES = {              'none': None, diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1ee35b501..78b7a923c 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -9,7 +9,10 @@ from ..utils import (      ExtractorError,      float_or_none,      int_or_none, +    multipart_encode,      parse_duration, +    random_birthday, +    urljoin,  ) @@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):              'description': 'md5:269ccd135d550da90d1662651fcb9772',              'thumbnail': r're:^https?://.*\.jpg$',              'average_rating': float, -            'duration': 39 +            
'duration': 39, +            'age_limit': 0,          }      }, {          'url': 'http://www.cda.pl/video/57413289', @@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):              'uploader': 'crash404',              'view_count': int,              'average_rating': float, -            'duration': 137 +            'duration': 137, +            'age_limit': 0,          }      }, { +        # Age-restricted +        'url': 'http://www.cda.pl/video/1273454c4', +        'info_dict': { +            'id': '1273454c4', +            'ext': 'mp4', +            'title': 'Bronson (2008) napisy HD 1080p', +            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', +            'height': 1080, +            'uploader': 'boniek61', +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 5554, +            'age_limit': 18, +            'view_count': int, +            'average_rating': float, +        }, +    }, {          'url': 'http://ebd.cda.pl/0x0/5749950c',          'only_matching': True,      }] +    def _download_age_confirm_page(self, url, video_id, *args, **kwargs): +        form_data = random_birthday('rok', 'miesiac', 'dzien') +        form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) +        data, content_type = multipart_encode(form_data) +        return self._download_webpage( +            urljoin(url, '/a/validatebirth'), video_id, *args, +            data=data, headers={ +                'Referer': url, +                'Content-Type': content_type, +            }, **kwargs) +      def _real_extract(self, url):          video_id = self._match_id(url)          self._set_cookie('cda.pl', 'cda.player', 'html5') @@ -57,6 +89,13 @@ class CDAIE(InfoExtractor):          if 'Ten film jest dostępny dla użytkowników premium' in webpage:              raise ExtractorError('This video is only available for premium users.', expected=True) +        need_confirm_age = False +        if 
self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")', +                                   webpage, 'birthday validate form', default=None): +            webpage = self._download_age_confirm_page( +                url, video_id, note='Confirming age') +            need_confirm_age = True +          formats = []          uploader = self._search_regex(r'''(?x) @@ -81,6 +120,7 @@ class CDAIE(InfoExtractor):              'thumbnail': self._og_search_thumbnail(webpage),              'formats': formats,              'duration': None, +            'age_limit': 18 if need_confirm_age else 0,          }          def extract_format(page, version): @@ -121,7 +161,12 @@ class CDAIE(InfoExtractor):          for href, resolution in re.findall(                  r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',                  webpage): -            webpage = self._download_webpage( +            if need_confirm_age: +                handler = self._download_age_confirm_page +            else: +                handler = self._download_webpage + +            webpage = handler(                  self._BASE_URL + href, video_id,                  'Downloading %s version information' % resolution, fatal=False)              if not webpage: @@ -129,6 +174,7 @@ class CDAIE(InfoExtractor):                  # invalid version is requested.                  
self.report_warning('Unable to download %s version information' % resolution)                  continue +              extract_format(webpage, resolution)          self._sort_formats(formats) diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index 049db25a5..e5f964d39 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -1,7 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import random  import re  from .common import InfoExtractor @@ -11,6 +10,7 @@ from ..utils import (      float_or_none,      parse_age_limit,      qualities, +    random_birthday,      try_get,      unified_timestamp,      urljoin, @@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) +        query = random_birthday('birth_year', 'birth_month', 'birth_day')          video = self._download_json(              'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, -            video_id, query={ -                'birth_month': random.randint(1, 12), -                'birth_day': random.randint(1, 31), -                'birth_year': random.randint(1950, 1995), -            }) +            video_id, query=query)          title = video['title'] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 86fc5ccac..25bd228ab 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -11,6 +11,7 @@ import contextlib  import ctypes  import datetime  import email.utils +import email.header  import errno  import functools  import gzip @@ -2097,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}):      return new_req +def try_multipart_encode(data, boundary): +    content_type = 'multipart/form-data; boundary=%s' % boundary + +    out = b'' +    for k, v in data.items(): +        out += b'--' + boundary.encode('ascii') + b'\r\n' +        if isinstance(k, compat_str): +            k = 
k.encode('utf-8') +        if isinstance(v, compat_str): +            v = v.encode('utf-8') +        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 +        # suggests sending UTF-8 directly. Firefox sends UTF-8, too +        content = b'Content-Disposition: form-data; name="%s"\r\n\r\n' % k + v + b'\r\n' +        if boundary.encode('ascii') in content: +            raise ValueError('Boundary overlaps with data') +        out += content + +    out += b'--' + boundary.encode('ascii') + b'--\r\n' + +    return out, content_type + + +def multipart_encode(data, boundary=None): +    ''' +    Encode a dict to RFC 7578-compliant form-data + +    data: +        A dict where keys and values can be either Unicode or bytes-like +        objects. +    boundary: +        If specified a Unicode object, it's used as the boundary. Otherwise +        a random boundary is generated. + +    Reference: https://tools.ietf.org/html/rfc7578 +    ''' +    has_specified_boundary = boundary is not None + +    while True: +        if boundary is None: +            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) + +        try: +            out, content_type = try_multipart_encode(data, boundary) +            break +        except ValueError: +            if has_specified_boundary: +                raise +            boundary = None + +    return out, content_type + +  def dict_get(d, key_or_keys, default=None, skip_false_values=True):      if isinstance(key_or_keys, (list, tuple)):          for key in key_or_keys: @@ -3760,3 +3813,11 @@ def write_xattr(path, key, value):                          "Couldn't find a tool to set the xattrs. 
"                          "Install either the python 'xattr' module, "                          "or the 'xattr' binary.") + + +def random_birthday(year_field, month_field, day_field): +    return { +        year_field: str(random.randint(1950, 1995)), +        month_field: str(random.randint(1, 12)), +        day_field: str(random.randint(1, 31)), +    } | 
