_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
- _BYPASS_GEO attribute may be set to False in order to disable
+ _GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
Though it won't disable explicit geo restriction bypass based on
- country code provided with bypass_geo_restriction_as_country.
+ country code provided with geo_bypass_country. (experimental)
+
+ _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
+ countries for this extractor. One of these countries will be used by the
+ geo restriction bypass mechanism right away in order to bypass
+ geo restriction, provided the mechanism is not disabled. (experimental)
+
+ NB: both of these geo attributes are experimental and may change in the
+ future or be removed completely.
Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
_ready = False
_downloader = None
_x_forwarded_for_ip = None
- _BYPASS_GEO = True
+ _GEO_BYPASS = True
+ _GEO_COUNTRIES = None
_WORKING = True
def __init__(self, downloader=None):
def initialize(self):
"""Initializes an instance (authentication, etc)."""
- if not self._x_forwarded_for_ip:
- country_code = self._downloader.params.get('bypass_geo_restriction_as_country', None)
- if country_code:
- self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+ self._initialize_geo_bypass(self._GEO_COUNTRIES)
if not self._ready:
self._real_initialize()
self._ready = True
+ def _initialize_geo_bypass(self, countries):
+ """
+ Initialize geo restriction bypass mechanism.
+
+ This method is used to initialize the geo bypass mechanism based on faking
+ the X-Forwarded-For HTTP header. A random country from the provided country
+ list is selected and a random IP belonging to this country is generated.
+ This IP will be passed as the X-Forwarded-For HTTP header in all subsequent
+ HTTP requests.
+
+ This method is called with _GEO_COUNTRIES during instance initialization
+ to perform the initial setup of the geo bypass mechanism.
+
+ You may also call it manually from the extractor's code if geo countries
+ information is not available beforehand (e.g. obtained during
+ extraction) or for some other reason.
+ """
+ if not self._x_forwarded_for_ip:
+ country_code = self._downloader.params.get('geo_bypass_country', None)
+ # If there is no explicit country for geo bypass specified and
+ # the extractor is known to be geo restricted let's fake IP
+ # as X-Forwarded-For right away.
+ if (not country_code and
+ self._GEO_BYPASS and
+ self._downloader.params.get('geo_bypass', True) and
+ countries):
+ country_code = random.choice(countries)
+ if country_code:
+ self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+ if self._downloader.params.get('verbose', False):
+ self._downloader.to_stdout(
+ '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
+ % (self._x_forwarded_for_ip, country_code.upper()))
+
def extract(self, url):
"""Extracts URL information and returns it in list of dicts."""
try:
ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
return ie_result
except GeoRestrictedError as e:
- if (not self._downloader.params.get('bypass_geo_restriction_as_country', None) and
- self._BYPASS_GEO and
- self._downloader.params.get('bypass_geo_restriction', True) and
- not self._x_forwarded_for_ip and
- e.countries):
- self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries))
- if self._x_forwarded_for_ip:
- self.report_warning(
- 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
- continue
+ if self.__maybe_fake_ip_and_retry(e.countries):
+ continue
raise
except ExtractorError:
raise
except (KeyError, StopIteration) as e:
raise ExtractorError('An extractor error has occurred.', cause=e)
+ def __maybe_fake_ip_and_retry(self, countries):
+ if (not self._downloader.params.get('geo_bypass_country', None) and
+ self._GEO_BYPASS and
+ self._downloader.params.get('geo_bypass', True) and
+ not self._x_forwarded_for_ip and
+ countries):
+ country_code = random.choice(countries)
+ self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+ if self._x_forwarded_for_ip:
+ self.report_warning(
+ 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
+ % (self._x_forwarded_for_ip, country_code.upper()))
+ return True
+ return False
+
def set_downloader(self, downloader):
"""Sets the downloader for this IE."""
self._downloader = downloader
})
return formats
- def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
+ def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
def absolute_url(video_url):
return compat_urlparse.urljoin(base_url, video_url)
is_plain_url = False
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
- entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+ entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
+ preference=preference)
elif ext == 'mpd':
is_plain_url = False
formats = self._extract_mpd_formats(
})
return formats
- @staticmethod
- def _find_jwplayer_data(webpage):
+ def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
webpage)
if mobj:
- return mobj.group('options')
+ try:
+ jwplayer_data = self._parse_json(mobj.group('options'),
+ video_id=video_id,
+ transform_source=transform_source)
+ except ExtractorError:
+ pass
+ else:
+ if isinstance(jwplayer_data, dict):
+ return jwplayer_data
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
- jwplayer_data = self._parse_json(
- self._find_jwplayer_data(webpage), video_id,
- transform_source=js_to_json)
+ jwplayer_data = self._find_jwplayer_data(
+ webpage, video_id, transform_source=js_to_json)
return self._parse_jwplayer_data(
jwplayer_data, video_id, *args, **kwargs)