X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=317a9a76fc417e9ad4455bc99b30e782849eeabc;hb=a88d461dff67205fcec684426afbcbeb4b0e7cf5;hp=6d4789d9636cc6874e2979dcfcbe56839323ad7c;hpb=dc0a869e5ee7a75218a759706bb11f17c4de6b72;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6d4789d96..317a9a76f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -26,6 +27,7 @@ from ..compat import ( compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..downloader.f4m import remove_encrypted_media from ..utils import ( @@ -36,34 +38,35 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + determine_protocol, error_to_compat_str, ExtractorError, + extract_attributes, fix_xml_ampersands, float_or_none, GeoRestrictedError, GeoUtils, int_or_none, js_to_json, + mimetype2ext, + orderedSet, + parse_codecs, + parse_duration, parse_iso8601, + parse_m3u8_attributes, RegexNotFoundError, - sanitize_filename, sanitized_Request, + sanitize_filename, unescapeHTML, unified_strdate, unified_timestamp, + update_Request, + update_url_query, + urljoin, url_basename, xpath_element, xpath_text, xpath_with_ns, - determine_protocol, - parse_duration, - mimetype2ext, - update_Request, - update_url_query, - parse_m3u8_attributes, - extract_attributes, - parse_codecs, - urljoin, ) @@ -243,6 +246,10 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: @@ -370,7 +377,7 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) m = cls._VALID_URL_RE.match(url) assert m - return m.group('id') + return compat_str(m.group('id')) @classmethod def working(cls): @@ -393,7 +400,6 @@ class InfoExtractor(object): is selected and a random IP belonging to this country is generated. This IP will be passed as X-Forwarded-For HTTP header in all subsequent HTTP requests. - Method does nothing if no countries are specified. This method will be used for initial geo bypass mechanism initialization during the instance initialization with _GEO_COUNTRIES. @@ -402,8 +408,6 @@ class InfoExtractor(object): information is not available beforehand (e.g. obtained during extraction) or due to some another reason. """ - if not countries: - return if not self._x_forwarded_for_ip: country_code = self._downloader.params.get('geo_bypass_country', None) # If there is no explicit country for geo bypass specified and @@ -417,8 +421,9 @@ class InfoExtractor(object): if country_code: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._downloader.params.get('verbose', False): - self._downloader.to_stdout( - '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + self._downloader.to_screen( + '[debug] Using fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) def extract(self, url): """Extracts URL information and returns it in list of dicts.""" @@ -447,10 +452,12 @@ class InfoExtractor(object): self._downloader.params.get('geo_bypass', True) and not self._x_forwarded_for_ip and countries): - self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries)) + country_code = random.choice(countries) + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._x_forwarded_for_ip: self.report_warning( - 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) return True return False @@ -546,6 +553,34 @@ class InfoExtractor(object): return encoding + def __check_blocked(self, content): + first_block = content[:512] + if ('Access to this site is blocked' in content and + 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'