diff options
| author | Sergey M․ <dstftw@gmail.com> | 2018-05-02 07:18:01 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2018-05-02 07:20:59 +0700 | 
| commit | 5f95927a62a533b9e616abb5f1481cedeaa16a4a (patch) | |
| tree | 13e1cbddf07dd4259211ede5df0d2705604ca5b1 /youtube_dl/extractor/common.py | |
| parent | a93ce61bd5cbe7779e4eff0f8ab74a8a02211285 (diff) | |
Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 97 | 
1 file changed, 80 insertions, 17 deletions
```diff
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index a9939b0fd..3ef5af13c 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -346,6 +346,11 @@ class InfoExtractor(object):
     geo restriction bypass mechanism right away in order to bypass
     geo restriction, of course, if the mechanism is not disabled. (experimental)
 
+    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
+    IP blocks in CIDR notation for this extractor. One of these IP blocks
+    will be used by geo restriction bypass mechanism similarly
+    to _GEO_COUNTRIES. (experimental)
+
     NB: both these geo attributes are experimental and may change in future
     or be completely removed.
 
@@ -358,6 +363,7 @@ class InfoExtractor(object):
     _x_forwarded_for_ip = None
     _GEO_BYPASS = True
     _GEO_COUNTRIES = None
+    _GEO_IP_BLOCKS = None
     _WORKING = True
 
     def __init__(self, downloader=None):
@@ -392,12 +398,15 @@ class InfoExtractor(object):
 
     def initialize(self):
         """Initializes an instance (authentication, etc)."""
-        self._initialize_geo_bypass(self._GEO_COUNTRIES)
+        self._initialize_geo_bypass({
+            'countries': self._GEO_COUNTRIES,
+            'ip_blocks': self._GEO_IP_BLOCKS,
+        })
         if not self._ready:
             self._real_initialize()
             self._ready = True
 
-    def _initialize_geo_bypass(self, countries):
+    def _initialize_geo_bypass(self, geo_bypass_context):
         """
         Initialize geo restriction bypass mechanism.
 
@@ -408,28 +417,82 @@ class InfoExtractor(object):
         HTTP requests.
 
         This method will be used for initial geo bypass mechanism initialization
-        during the instance initialization with _GEO_COUNTRIES.
+        during the instance initialization with _GEO_COUNTRIES and
+        _GEO_IP_BLOCKS.
 
-        You may also manually call it from extractor's code if geo countries
+        You may also manually call it from extractor's code if geo bypass
         information is not available beforehand (e.g. obtained during
-        extraction) or due to some another reason.
+        extraction) or due to some other reason. In this case you should pass
+        this information in geo bypass context passed as first argument. It may
+        contain following fields:
+
+        countries:  List of geo unrestricted countries (similar
+                    to _GEO_COUNTRIES)
+        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
+                    (similar to _GEO_IP_BLOCKS)
+
         """
         if not self._x_forwarded_for_ip:
-            country_code = self._downloader.params.get('geo_bypass_country', None)
-            # If there is no explicit country for geo bypass specified and
-            # the extractor is known to be geo restricted let's fake IP
-            # as X-Forwarded-For right away.
-            if (not country_code and
-                    self._GEO_BYPASS and
-                    self._downloader.params.get('geo_bypass', True) and
-                    countries):
-                country_code = random.choice(countries)
-            if country_code:
-                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+
+            # Geo bypass mechanism is explicitly disabled by user
+            if not self._downloader.params.get('geo_bypass', True):
+                return
+
+            if not geo_bypass_context:
+                geo_bypass_context = {}
+
+            # Backward compatibility: previously _initialize_geo_bypass
+            # expected a list of countries, some 3rd party code may still use
+            # it this way
+            if isinstance(geo_bypass_context, (list, tuple)):
+                geo_bypass_context = {
+                    'countries': geo_bypass_context,
+                }
+
+            # The whole point of geo bypass mechanism is to fake IP
+            # as X-Forwarded-For HTTP header based on some IP block or
+            # country code.
+
+            # Path 1: bypassing based on IP block in CIDR notation
+
+            # Explicit IP block specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
+
+            # Otherwise use random IP block from geo bypass context but only
+            # if extractor is known as geo bypassable
+            if not ip_block:
+                ip_blocks = geo_bypass_context.get('ip_blocks')
+                if self._GEO_BYPASS and ip_blocks:
+                    ip_block = random.choice(ip_blocks)
+
+            if ip_block:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
+                if self._downloader.params.get('verbose', False):
+                    self._downloader.to_screen(
+                        '[debug] Using fake IP %s as X-Forwarded-For.'
+                        % self._x_forwarded_for_ip)
+                return
+
+            # Path 2: bypassing based on country code
+
+            # Explicit country code specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            country = self._downloader.params.get('geo_bypass_country', None)
+
+            # Otherwise use random country code from geo bypass context but
+            # only if extractor is known as geo bypassable
+            if not country:
+                countries = geo_bypass_context.get('countries')
+                if self._GEO_BYPASS and countries:
+                    country = random.choice(countries)
+
+            if country:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                 if self._downloader.params.get('verbose', False):
                     self._downloader.to_screen(
                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
-                        % (self._x_forwarded_for_ip, country_code.upper()))
+                        % (self._x_forwarded_for_ip, country.upper()))
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
```
