Improve geo bypass mechanism
authorSergey M․ <dstftw@gmail.com>
Wed, 2 May 2018 00:18:01 +0000 (07:18 +0700)
committerSergey M․ <dstftw@gmail.com>
Wed, 2 May 2018 00:20:59 +0000 (07:20 +0700)
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/extractor/anvato.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/common.py
youtube_dl/extractor/dplay.py
youtube_dl/extractor/go.py
youtube_dl/extractor/limelight.py
youtube_dl/extractor/tvplay.py
youtube_dl/options.py
youtube_dl/utils.py

index ad359880526116d71d13fa2a736ab02ca79e19c0..f1a35901159f89ada4aaa8b3f23135b1e1e65f80 100755 (executable)
@@ -286,6 +286,9 @@ class YoutubeDL(object):
                        Two-letter ISO 3166-2 country code that will be used for
                        explicit geographic restriction bypassing via faking
                        X-Forwarded-For HTTP header (experimental)
                        Two-letter ISO 3166-2 country code that will be used for
                        explicit geographic restriction bypassing via faking
                        X-Forwarded-For HTTP header (experimental)
+    geo_bypass_ip_block:
+                       IP range in CIDR notation that will be used similarly to
+                       geo_bypass_country (experimental)
 
     The following options determine which downloader is picked:
     external_downloader: Executable of the external downloader to call.
 
     The following options determine which downloader is picked:
     external_downloader: Executable of the external downloader to call.
index 9bb952457e149f3687efd0ca925aed8d69996366..ba435ea428c57c933a5ea113dd58f38bbf7e2cd4 100644 (file)
@@ -430,6 +430,7 @@ def _real_main(argv=None):
         'config_location': opts.config_location,
         'geo_bypass': opts.geo_bypass,
         'geo_bypass_country': opts.geo_bypass_country,
         'config_location': opts.config_location,
         'geo_bypass': opts.geo_bypass,
         'geo_bypass_country': opts.geo_bypass_country,
+        'geo_bypass_ip_block': opts.geo_bypass_ip_block,
         # just for deprecation check
         'autonumber': opts.autonumber if opts.autonumber is True else None,
         'usetitle': opts.usetitle if opts.usetitle is True else None,
         # just for deprecation check
         'autonumber': opts.autonumber if opts.autonumber is True else None,
         'usetitle': opts.usetitle if opts.usetitle is True else None,
index 7a29cd2c6315fad6caa05ac6a499f07f80a41566..f6a78eb5d4cb1288d3bc2b864303343b24603422 100644 (file)
@@ -277,7 +277,9 @@ class AnvatoIE(InfoExtractor):
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
-        self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
 
         mobj = re.match(self._VALID_URL, url)
         access_key, video_id = mobj.group('access_key_or_mcp', 'id')
 
         mobj = re.match(self._VALID_URL, url)
         access_key, video_id = mobj.group('access_key_or_mcp', 'id')
index 0e4eaef659105df0248d6ef5171f5f444660f1ee..ab62e54d63c2335d0002701c078105e00ccc6186 100644 (file)
@@ -669,7 +669,10 @@ class BrightcoveNewIE(AdobePassIE):
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
-        self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+            'ip_blocks': smuggled_data.get('geo_ip_blocks'),
+        })
 
         account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()
 
 
         account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()
 
index a9939b0fdc817c42e4c09999560dfd8c76873a5c..3ef5af13c415205540fed3dd4a02bf6b6dfb1822 100644 (file)
@@ -346,6 +346,11 @@ class InfoExtractor(object):
     geo restriction bypass mechanism right away in order to bypass
     geo restriction, of course, if the mechanism is not disabled. (experimental)
 
     geo restriction bypass mechanism right away in order to bypass
     geo restriction, of course, if the mechanism is not disabled. (experimental)
 
+    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
+    IP blocks in CIDR notation for this extractor. One of these IP blocks
+    will be used by geo restriction bypass mechanism similarly
+    to _GEO_COUNTRIES. (experimental)
+
     NB: both these geo attributes are experimental and may change in future
     or be completely removed.
 
     NB: both these geo attributes are experimental and may change in future
     or be completely removed.
 
@@ -358,6 +363,7 @@ class InfoExtractor(object):
     _x_forwarded_for_ip = None
     _GEO_BYPASS = True
     _GEO_COUNTRIES = None
     _x_forwarded_for_ip = None
     _GEO_BYPASS = True
     _GEO_COUNTRIES = None
+    _GEO_IP_BLOCKS = None
     _WORKING = True
 
     def __init__(self, downloader=None):
     _WORKING = True
 
     def __init__(self, downloader=None):
@@ -392,12 +398,15 @@ class InfoExtractor(object):
 
     def initialize(self):
         """Initializes an instance (authentication, etc)."""
 
     def initialize(self):
         """Initializes an instance (authentication, etc)."""
-        self._initialize_geo_bypass(self._GEO_COUNTRIES)
+        self._initialize_geo_bypass({
+            'countries': self._GEO_COUNTRIES,
+            'ip_blocks': self._GEO_IP_BLOCKS,
+        })
         if not self._ready:
             self._real_initialize()
             self._ready = True
 
         if not self._ready:
             self._real_initialize()
             self._ready = True
 
-    def _initialize_geo_bypass(self, countries):
+    def _initialize_geo_bypass(self, geo_bypass_context):
         """
         Initialize geo restriction bypass mechanism.
 
         """
         Initialize geo restriction bypass mechanism.
 
@@ -408,28 +417,82 @@ class InfoExtractor(object):
         HTTP requests.
 
         This method will be used for initial geo bypass mechanism initialization
         HTTP requests.
 
         This method will be used for initial geo bypass mechanism initialization
-        during the instance initialization with _GEO_COUNTRIES.
+        during the instance initialization with _GEO_COUNTRIES and
+        _GEO_IP_BLOCKS.
 
 
-        You may also manually call it from extractor's code if geo countries
+        You may also manually call it from extractor's code if geo bypass
         information is not available beforehand (e.g. obtained during
         information is not available beforehand (e.g. obtained during
-        extraction) or due to some another reason.
+        extraction) or due to some other reason. In this case you should pass
+        this information in geo bypass context passed as first argument. It may
+        contain following fields:
+
+        countries:  List of geo unrestricted countries (similar
+                    to _GEO_COUNTRIES)
+        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
+                    (similar to _GEO_IP_BLOCKS)
+
         """
         if not self._x_forwarded_for_ip:
         """
         if not self._x_forwarded_for_ip:
-            country_code = self._downloader.params.get('geo_bypass_country', None)
-            # If there is no explicit country for geo bypass specified and
-            # the extractor is known to be geo restricted let's fake IP
-            # as X-Forwarded-For right away.
-            if (not country_code and
-                    self._GEO_BYPASS and
-                    self._downloader.params.get('geo_bypass', True) and
-                    countries):
-                country_code = random.choice(countries)
-            if country_code:
-                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+
+            # Geo bypass mechanism is explicitly disabled by user
+            if not self._downloader.params.get('geo_bypass', True):
+                return
+
+            if not geo_bypass_context:
+                geo_bypass_context = {}
+
+            # Backward compatibility: previously _initialize_geo_bypass
+            # expected a list of countries, some 3rd party code may still use
+            # it this way
+            if isinstance(geo_bypass_context, (list, tuple)):
+                geo_bypass_context = {
+                    'countries': geo_bypass_context,
+                }
+
+            # The whole point of geo bypass mechanism is to fake IP
+            # as X-Forwarded-For HTTP header based on some IP block or
+            # country code.
+
+            # Path 1: bypassing based on IP block in CIDR notation
+
+            # Explicit IP block specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
+
+            # Otherwise use random IP block from geo bypass context but only
+            # if extractor is known as geo bypassable
+            if not ip_block:
+                ip_blocks = geo_bypass_context.get('ip_blocks')
+                if self._GEO_BYPASS and ip_blocks:
+                    ip_block = random.choice(ip_blocks)
+
+            if ip_block:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
+                if self._downloader.params.get('verbose', False):
+                    self._downloader.to_screen(
+                        '[debug] Using fake IP %s as X-Forwarded-For.'
+                        % self._x_forwarded_for_ip)
+                return
+
+            # Path 2: bypassing based on country code
+
+            # Explicit country code specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            country = self._downloader.params.get('geo_bypass_country', None)
+
+            # Otherwise use random country code from geo bypass context but
+            # only if extractor is known as geo bypassable
+            if not country:
+                countries = geo_bypass_context.get('countries')
+                if self._GEO_BYPASS and countries:
+                    country = random.choice(countries)
+
+            if country:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                 if self._downloader.params.get('verbose', False):
                     self._downloader.to_screen(
                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                 if self._downloader.params.get('verbose', False):
                     self._downloader.to_screen(
                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
-                        % (self._x_forwarded_for_ip, country_code.upper()))
+                        % (self._x_forwarded_for_ip, country.upper()))
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
index b734467734c30a880badab5ff72cfac7543b3b56..8e0374320783195c810d1be8fff7d44b95b32414 100644 (file)
@@ -102,7 +102,9 @@ class DPlayIE(InfoExtractor):
         display_id = mobj.group('id')
         domain = mobj.group('domain')
 
         display_id = mobj.group('id')
         domain = mobj.group('domain')
 
-        self._initialize_geo_bypass([mobj.group('country').upper()])
+        self._initialize_geo_bypass({
+            'countries': [mobj.group('country').upper()],
+        })
 
         webpage = self._download_webpage(url, display_id)
 
 
         webpage = self._download_webpage(url, display_id)
 
index 9c7b1bd37d447c5a3cb09aa6b3418b63accffcad..e781405f2f7d55aa51f8cc50cc99df22ee5dff40 100644 (file)
@@ -123,7 +123,7 @@ class GoIE(AdobePassIE):
                         'adobe_requestor_id': requestor_id,
                     })
                 else:
                         'adobe_requestor_id': requestor_id,
                     })
                 else:
-                    self._initialize_geo_bypass(['US'])
+                    self._initialize_geo_bypass({'countries': ['US']})
                 entitlement = self._download_json(
                     'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
                     video_id, data=urlencode_postdata(data))
                 entitlement = self._download_json(
                     'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
                     video_id, data=urlencode_postdata(data))
index 2803d7e8df47c92003ccd11a23c951af1644a7a1..729d8de50fab70cd69bab41fae9db0cba4d7da9b 100644 (file)
@@ -282,7 +282,9 @@ class LimelightMediaIE(LimelightBaseIE):
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
-        self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
 
         pc, mobile, metadata = self._extract(
             video_id, 'getPlaylistByMediaId',
 
         pc, mobile, metadata = self._extract(
             video_id, 'getPlaylistByMediaId',
index 84597b55e0f6047a1dccd1905cb4771949b3cf00..e09b5f804d897954f4488344d27beaa8a7a2eea6 100644 (file)
@@ -227,14 +227,16 @@ class TVPlayIE(InfoExtractor):
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
-        self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
 
         video_id = self._match_id(url)
         geo_country = self._search_regex(
             r'https?://[^/]+\.([a-z]{2})', url,
             'geo country', default=None)
         if geo_country:
 
         video_id = self._match_id(url)
         geo_country = self._search_regex(
             r'https?://[^/]+\.([a-z]{2})', url,
             'geo country', default=None)
         if geo_country:
-            self._initialize_geo_bypass([geo_country.upper()])
+            self._initialize_geo_bypass({'countries': [geo_country.upper()]})
         video = self._download_json(
             'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
 
         video = self._download_json(
             'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
 
index 3e4ac03a240844ef3b69fd21dbedcc441a46e5c7..f3f8f23b6b8b623194fe4250a53bcf45c9d326c8 100644 (file)
@@ -249,6 +249,10 @@ def parseOpts(overrideArguments=None):
         '--geo-bypass-country', metavar='CODE',
         dest='geo_bypass_country', default=None,
         help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)')
         '--geo-bypass-country', metavar='CODE',
         dest='geo_bypass_country', default=None,
         help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)')
+    geo.add_option(
+        '--geo-bypass-ip-block', metavar='IP_BLOCK',
+        dest='geo_bypass_ip_block', default=None,
+        help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation (experimental)')
 
     selection = optparse.OptionGroup(parser, 'Video Selection')
     selection.add_option(
 
     selection = optparse.OptionGroup(parser, 'Video Selection')
     selection.add_option(
index b460393bf37106a47304dfa97dd7b1efa37117cd..f9ca63c589ffb56fb2c18f76ee40a99795265f0a 100644 (file)
@@ -3534,10 +3534,13 @@ class GeoUtils(object):
     }
 
     @classmethod
     }
 
     @classmethod
-    def random_ipv4(cls, code):
-        block = cls._country_ip_map.get(code.upper())
-        if not block:
-            return None
+    def random_ipv4(cls, code_or_block):
+        if len(code_or_block) == 2:
+            block = cls._country_ip_map.get(code_or_block.upper())
+            if not block:
+                return None
+        else:
+            block = code_or_block
         addr, preflen = block.split('/')
         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
         addr_max = addr_min | (0xffffffff >> int(preflen))
         addr, preflen = block.split('/')
         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
         addr_max = addr_min | (0xffffffff >> int(preflen))