X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=e747258aa22378ddd25acbe3aeb5b096c32b2df3;hb=485047854376465f95309daad4966971f56728ef;hp=272da74b65997fcc02153159cd512c2fb7c0f36a;hpb=4248dad92bd87650c791194276296b148f668e68;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 272da74b6..e747258aa 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -36,34 +37,35 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + determine_protocol, error_to_compat_str, ExtractorError, + extract_attributes, fix_xml_ampersands, float_or_none, GeoRestrictedError, GeoUtils, int_or_none, js_to_json, + mimetype2ext, + orderedSet, + parse_codecs, + parse_duration, parse_iso8601, + parse_m3u8_attributes, RegexNotFoundError, - sanitize_filename, sanitized_Request, + sanitize_filename, unescapeHTML, unified_strdate, unified_timestamp, + update_Request, + update_url_query, + urljoin, url_basename, xpath_element, xpath_text, xpath_with_ns, - determine_protocol, - parse_duration, - mimetype2ext, - update_Request, - update_url_query, - parse_m3u8_attributes, - extract_attributes, - parse_codecs, - urljoin, ) @@ -243,6 +245,10 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: @@ -333,6 +339,9 @@ class InfoExtractor(object): geo restriction bypass mechanism right away in order to bypass geo restriction, of course, if the mechanism is not disabled. (experimental) + NB: both these geo attributes are experimental and may change in future + or be completely removed. + Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -367,7 +376,7 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) m = cls._VALID_URL_RE.match(url) assert m - return m.group('id') + return compat_str(m.group('id')) @classmethod def working(cls): @@ -376,12 +385,28 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" - self.__initialize_geo_bypass() + self._initialize_geo_bypass(self._GEO_COUNTRIES) if not self._ready: self._real_initialize() self._ready = True - def __initialize_geo_bypass(self): + def _initialize_geo_bypass(self, countries): + """ + Initialize geo restriction bypass mechanism. + + This method is used to initialize geo bypass mechanism based on faking + X-Forwarded-For HTTP header. A random country from provided country list + is selected and a random IP belonging to this country is generated. This + IP will be passed as X-Forwarded-For HTTP header in all subsequent + HTTP requests. + + This method will be used for initial geo bypass mechanism initialization + during the instance initialization with _GEO_COUNTRIES. + + You may also manually call it from extractor's code if geo countries + information is not available beforehand (e.g. obtained during + extraction) or due to some another reason. + """ if not self._x_forwarded_for_ip: country_code = self._downloader.params.get('geo_bypass_country', None) # If there is no explicit country for geo bypass specified and @@ -390,13 +415,14 @@ class InfoExtractor(object): if (not country_code and self._GEO_BYPASS and self._downloader.params.get('geo_bypass', True) and - self._GEO_COUNTRIES): - country_code = random.choice(self._GEO_COUNTRIES) + countries): + country_code = random.choice(countries) if country_code: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._downloader.params.get('verbose', False): - self._downloader.to_stdout( - '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + self._downloader.to_screen( + '[debug] Using fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) def extract(self, url): """Extracts URL information and returns it in list of dicts.""" @@ -425,10 +451,12 @@ class InfoExtractor(object): self._downloader.params.get('geo_bypass', True) and not self._x_forwarded_for_ip and countries): - self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries)) + country_code = random.choice(countries) + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._x_forwarded_for_ip: self.report_warning( - 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) return True return False @@ -524,6 +552,34 @@ class InfoExtractor(object): return encoding + def __check_blocked(self, content): + first_block = content[:512] + if ('Access to this site is blocked' in content and + 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'