X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=e747258aa22378ddd25acbe3aeb5b096c32b2df3;hb=485047854376465f95309daad4966971f56728ef;hp=dce8c7d0d5ad389aa84bf84f25731a2e680e91e3;hpb=c4251b9aaa9a69e7f7b55197b3907e52b17150d4;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dce8c7d0d..e747258aa 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -6,6 +7,7 @@ import hashlib import json import netrc import os +import random import re import socket import sys @@ -35,31 +37,35 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + determine_protocol, error_to_compat_str, ExtractorError, + extract_attributes, fix_xml_ampersands, float_or_none, + GeoRestrictedError, + GeoUtils, int_or_none, + js_to_json, + mimetype2ext, + orderedSet, + parse_codecs, + parse_duration, parse_iso8601, + parse_m3u8_attributes, RegexNotFoundError, - sanitize_filename, sanitized_Request, + sanitize_filename, unescapeHTML, unified_strdate, unified_timestamp, + update_Request, + update_url_query, + urljoin, url_basename, xpath_element, xpath_text, xpath_with_ns, - determine_protocol, - parse_duration, - mimetype2ext, - update_Request, - update_url_query, - parse_m3u8_attributes, - extract_attributes, - parse_codecs, - urljoin, ) @@ -121,9 +127,19 @@ class InfoExtractor(object): download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", "m3u8", "m3u8_native" or "http_dash_segments". - * fragments A list of fragments of the fragmented media, - with the following entries: - * "url" (mandatory) - fragment's URL + * fragment_base_url + Base URL for fragments. Each fragment's path + value (if present) will be relative to + this URL. + * fragments A list of fragments of a fragmented media. + Each fragment entry must contain either an url + or a path. If an url is present it should be + considered by a client. Otherwise both path and + fragment_base_url must be present. Here is + the list of all potential fields: + * "url" - fragment's URL + * "path" - fragment's path relative to + fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) * preference Order number of this format. If this field is @@ -229,6 +245,10 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: @@ -309,17 +329,34 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. + _GEO_BYPASS attribute may be set to False in order to disable + geo restriction bypass mechanisms for a particular extractor. + Though it won't disable explicit geo restriction bypass based on + country code provided with geo_bypass_country. (experimental) + + _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted + countries for this extractor. One of these countries will be used by + geo restriction bypass mechanism right away in order to bypass + geo restriction, of course, if the mechanism is not disabled. (experimental) + + NB: both these geo attributes are experimental and may change in future + or be completely removed. + Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ _ready = False _downloader = None + _x_forwarded_for_ip = None + _GEO_BYPASS = True + _GEO_COUNTRIES = None _WORKING = True def __init__(self, downloader=None): """Constructor. Receives an optional downloader.""" self._ready = False + self._x_forwarded_for_ip = None self.set_downloader(downloader) @classmethod @@ -339,7 +376,7 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) m = cls._VALID_URL_RE.match(url) assert m - return m.group('id') + return compat_str(m.group('id')) @classmethod def working(cls): @@ -348,15 +385,59 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" + self._initialize_geo_bypass(self._GEO_COUNTRIES) if not self._ready: self._real_initialize() self._ready = True + def _initialize_geo_bypass(self, countries): + """ + Initialize geo restriction bypass mechanism. + + This method is used to initialize geo bypass mechanism based on faking + X-Forwarded-For HTTP header. A random country from provided country list + is selected and a random IP belonging to this country is generated. This + IP will be passed as X-Forwarded-For HTTP header in all subsequent + HTTP requests. + + This method will be used for initial geo bypass mechanism initialization + during the instance initialization with _GEO_COUNTRIES. + + You may also manually call it from extractor's code if geo countries + information is not available beforehand (e.g. obtained during + extraction) or due to some another reason. + """ + if not self._x_forwarded_for_ip: + country_code = self._downloader.params.get('geo_bypass_country', None) + # If there is no explicit country for geo bypass specified and + # the extractor is known to be geo restricted let's fake IP + # as X-Forwarded-For right away. + if (not country_code and + self._GEO_BYPASS and + self._downloader.params.get('geo_bypass', True) and + countries): + country_code = random.choice(countries) + if country_code: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) + def extract(self, url): """Extracts URL information and returns it in list of dicts.""" try: - self.initialize() - return self._real_extract(url) + for _ in range(2): + try: + self.initialize() + ie_result = self._real_extract(url) + if self._x_forwarded_for_ip: + ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip + return ie_result + except GeoRestrictedError as e: + if self.__maybe_fake_ip_and_retry(e.countries): + continue + raise except ExtractorError: raise except compat_http_client.IncompleteRead as e: @@ -364,6 +445,21 @@ class InfoExtractor(object): except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e) + def __maybe_fake_ip_and_retry(self, countries): + if (not self._downloader.params.get('geo_bypass_country', None) and + self._GEO_BYPASS and + self._downloader.params.get('geo_bypass', True) and + not self._x_forwarded_for_ip and + countries): + country_code = random.choice(countries) + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + if self._x_forwarded_for_ip: + self.report_warning( + 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) + return True + return False + def set_downloader(self, downloader): """Sets the downloader for this IE.""" self._downloader = downloader @@ -423,6 +519,15 @@ class InfoExtractor(object): if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal @@ -447,6 +552,34 @@ class InfoExtractor(object): return encoding + def __check_blocked(self, content): + first_block = content[:512] + if ('Access to this site is blocked' in content and + 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'