X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=65556d056a7edfe12b94e82d0f55baa93360aa70;hb=a2973eb59733c5f86a249c627d654b789020bc7d;hp=de09b53b2d81150e355788f8f870dcaf112ae296;hpb=d47e980d0de4aeeca757433365836b61c2ee9f9f;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index de09b53b2..65556d056 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import base64 import calendar import codecs import contextlib @@ -35,6 +36,7 @@ import zlib from .compat import ( compat_basestring, compat_chr, + compat_etree_fromstring, compat_html_entities, compat_http_client, compat_kwargs, @@ -62,6 +64,8 @@ std_headers = { } +NO_DEFAULT = object() + ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] @@ -137,21 +141,24 @@ def write_json_file(obj, fn): if sys.version_info >= (2, 7): - def find_xpath_attr(node, xpath, key, val): + def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ - assert re.match(r'^[a-zA-Z-]+$', key) - assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) - expr = xpath + "[@%s='%s']" % (key, val) + assert re.match(r'^[a-zA-Z_-]+$', key) + if val: + assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) + expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) return node.find(expr) else: - def find_xpath_attr(node, xpath, key, val): + def find_xpath_attr(node, xpath, key, val=None): # Here comes the crazy part: In 2.6, if the xpath is a unicode, # .//node does not match if a node is a direct child of . ! if isinstance(xpath, compat_str): xpath = xpath.encode('ascii') for f in node.findall(xpath): - if f.attrib.get(key) == val: + if key not in f.attrib: + continue + if val is None or f.attrib.get(key) == val: return f return None @@ -171,20 +178,59 @@ def xpath_with_ns(path, ns_map): return '/'.join(replaced) -def xpath_text(node, xpath, name=None, fatal=False): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') +def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + def _find_xpath(xpath): + if sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') + return node.find(xpath) - n = node.find(xpath) - if n is None or n.text is None: - if fatal: + if isinstance(xpath, (str, compat_str)): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break + + if n is None: + if default is not NO_DEFAULT: + return default + elif fatal: name = xpath if name is None else name raise ExtractorError('Could not find XML element %s' % name) else: return None + return n + + +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + n = xpath_element(node, xpath, name, fatal=fatal, default=default) + if n is None or n == default: + return n + if n.text is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element\'s text %s' % name) + else: + return None return n.text +def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): + n = find_xpath_attr(node, xpath, key) + if n is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = '%s[@%s]' % (xpath, key) if name is None else name + raise ExtractorError('Could not find XML attribute %s' % name) + else: + return None + return n.attrib[key] + + def get_element_by_id(id, html): """Return the content of the tag with the specified ID in the passed HTML document""" return get_element_by_attribute("id", id, html) @@ -213,6 +259,15 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'): + attributes = re.findall(attributes_regex, attributes_str) + attributes_dict = {} + if attributes: + for (attribute_name, attribute_value) in attributes: + attributes_dict[attribute_name] = attribute_value + return attributes_dict + + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -320,20 +375,13 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) return os.path.join(*sanitized_path) -def sanitize_url_path_consecutive_slashes(url): - """Collapses consecutive slashes in URLs' path""" - parsed_url = list(compat_urlparse.urlparse(url)) - parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2]) - return compat_urlparse.urlunparse(parsed_url) - - def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -579,16 +627,19 @@ class ContentTooShortError(Exception): download is too small for what the server announced first, indicating the connection was probably interrupted. """ - # Both in bytes - downloaded = None - expected = None def __init__(self, downloaded, expected): + # Both in bytes self.downloaded = downloaded self.expected = expected def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): + # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting + # expected HTTP responses to meet HTTP/1.0 or later (see also + # https://github.com/rg3/youtube-dl/issues/6727) + if sys.version_info < (3, 0): + kwargs[b'strict'] = True hc = http_class(*args, **kwargs) source_address = ydl_handler._params.get('source_address') if source_address is not None: @@ -653,6 +704,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + new_req = req_type( + url_escaped, data=req.data, headers=req.headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + new_req.timeout = req.timeout + req = new_req + for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 # The dict keys are capitalized because of this bug by urllib @@ -697,6 +768,18 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see + # https://github.com/rg3/youtube-dl/issues/6457). + if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + if sys.version_info >= (3, 0): + location = location.encode('iso-8859-1').decode('utf-8') + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + resp.headers['Location'] = location_escaped return resp https_request = http_request @@ -720,15 +803,41 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) +class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): + def __init__(self, cookiejar=None): + compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) + + def http_response(self, request, response): + # Python 2 will choke on next HTTP request in row if there are non-ASCII + # characters in Set-Cookie HTTP header of last response (see + # https://github.com/rg3/youtube-dl/issues/6769). + # In order to at least prevent crashing we will percent encode Set-Cookie + # header before HTTPCookieProcessor starts processing it. + # if sys.version_info < (3, 0) and response.headers: + # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): + # set_cookie = response.headers.get(set_cookie_header) + # if set_cookie: + # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") + # if set_cookie != set_cookie_escaped: + # del response.headers[set_cookie_header] + # response.headers[set_cookie_header] = set_cookie_escaped + return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) + + https_request = compat_urllib_request.HTTPCookieProcessor.http_request + https_response = http_response + + def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ if date_str is None: return None + date_str = re.sub(r'\.[0-9]+', '', date_str) + if timezone is None: m = re.search( - r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', + r'(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', date_str) if not m: timezone = datetime.timedelta() @@ -741,9 +850,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): timezone = datetime.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) + try: + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + except ValueError: + pass def unified_strdate(date_str, day_first=True): @@ -808,7 +920,8 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return upload_date + if upload_date is not None: + return compat_str(upload_date) def determine_ext(url, default_ext='unknown_video'): @@ -1284,7 +1397,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): v = getattr(v, get_attr, None) if v == '': v = None - return default if v is None else (int(v) * invscale // scale) + if v is None: + return default + try: + return int(v) * invscale // scale + except ValueError: + return default def str_or_none(v, default=None): @@ -1300,7 +1418,12 @@ def str_to_int(int_str): def float_or_none(v, scale=1, invscale=1, default=None): - return default if v is None else (float(v) * invscale / scale) + if v is None: + return default + try: + return float(v) * invscale / scale + except ValueError: + return default def parse_duration(s): @@ -1312,10 +1435,10 @@ def parse_duration(s): m = re.match( r'''(?ix)(?:P?T)? (?: - (?P[0-9.]+)\s*(?:mins?|minutes?)\s*| + (?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*| (?P[0-9.]+)\s*(?:hours?)| - \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*| + \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*| (?: (?: (?:(?P[0-9]+)\s*(?:[:d]|days?)\s*)? @@ -1549,27 +1672,8 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') -try: - etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError: # Python <=2.6 - etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): - class TreeBuilder(xml.etree.ElementTree.TreeBuilder): - def doctype(self, name, pubid, system): - pass # Ignore doctypes - - parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) - kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - # Fix up XML parser in Python 2.x - if sys.version_info < (3, 0): - for n in etree_iter(tree): - if n.text is not None: - if not isinstance(n.text, compat_str): - n.text = n.text.decode('utf-8') - return tree +def encode_dict(d, encoding='utf-8'): + return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) US_RATINGS = { @@ -1599,8 +1703,8 @@ def js_to_json(code): if v in ('true', 'false', 'null'): return v if v.startswith('"'): - return v - if v.startswith("'"): + v = re.sub(r"\\'", "'", v[1:-1]) + elif v.startswith("'"): v = v[1:-1] v = re.sub(r"\\\\|\\'|\"", lambda m: { '\\\\': '\\\\', @@ -1672,6 +1776,7 @@ def mimetype2ext(mt): return { 'x-ms-wmv': 'wmv', 'x-mp4-fragmented': 'mp4', + 'ttml+xml': 'ttml', }.get(res, res) @@ -1693,6 +1798,10 @@ def urlhandle_detect_ext(url_handle): return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + + def age_restricted(content_limit, age_limit): """ Returns True iff the content should be blocked """ @@ -1842,16 +1951,15 @@ def parse_dfxp_time_expr(time_expr): return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) -def format_srt_time(seconds): - (mins, secs) = divmod(seconds, 60) - (hours, mins) = divmod(mins, 60) - millisecs = (secs - int(secs)) * 1000 - secs = int(secs) - return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs) +def srt_subtitles_timecode(seconds): + return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) def dfxp2srt(dfxp_data): - _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'}) + _x = functools.partial(xpath_with_ns, ns_map={ + 'ttml': 'http://www.w3.org/ns/ttml', + 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', + }) def parse_node(node): str_or_empty = functools.partial(str_or_none, default='') @@ -1859,29 +1967,524 @@ def dfxp2srt(dfxp_data): out = str_or_empty(node.text) for child in node: - if child.tag == _x('ttml:br'): + if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): out += '\n' + str_or_empty(child.tail) - elif child.tag == _x('ttml:span'): + elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): out += str_or_empty(parse_node(child)) else: out += str_or_empty(xml.etree.ElementTree.tostring(child)) return out - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] - paras = dfxp.findall(_x('.//ttml:p')) + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') + + if not paras: + raise ValueError('Invalid dfxp/TTML subtitle') for para, index in zip(paras, itertools.count(1)): + begin_time = parse_dfxp_time_expr(para.attrib['begin']) + end_time = parse_dfxp_time_expr(para.attrib.get('end')) + if not end_time: + end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) out.append('%d\n%s --> %s\n%s\n\n' % ( index, - format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))), - format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))), + srt_subtitles_timecode(begin_time), + srt_subtitles_timecode(end_time), parse_node(para))) return ''.join(out) +def cli_option(params, command_option, param): + param = params.get(param) + return [command_option, param] if param is not None else [] + + +def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): + param = params.get(param) + assert isinstance(param, bool) + if separator: + return [command_option + separator + (true_value if param else false_value)] + return [command_option, true_value if param else false_value] + + +def cli_valueless_option(params, command_option, param, expected_value=True): + param = params.get(param) + return [command_option] if param == expected_value else [] + + +def cli_configuration_args(params, param, default=[]): + ex_args = params.get(param) + if ex_args is None: + return default + assert isinstance(ex_args, list) + return ex_args + + +class ISO639Utils(object): + # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt + _lang_map = { + 'aa': 'aar', + 'ab': 'abk', + 'ae': 'ave', + 'af': 'afr', + 'ak': 'aka', + 'am': 'amh', + 'an': 'arg', + 'ar': 'ara', + 'as': 'asm', + 'av': 'ava', + 'ay': 'aym', + 'az': 'aze', + 'ba': 'bak', + 'be': 'bel', + 'bg': 'bul', + 'bh': 'bih', + 'bi': 'bis', + 'bm': 'bam', + 'bn': 'ben', + 'bo': 'bod', + 'br': 'bre', + 'bs': 'bos', + 'ca': 'cat', + 'ce': 'che', + 'ch': 'cha', + 'co': 'cos', + 'cr': 'cre', + 'cs': 'ces', + 'cu': 'chu', + 'cv': 'chv', + 'cy': 'cym', + 'da': 'dan', + 'de': 'deu', + 'dv': 'div', + 'dz': 'dzo', + 'ee': 'ewe', + 'el': 'ell', + 'en': 'eng', + 'eo': 'epo', + 'es': 'spa', + 'et': 'est', + 'eu': 'eus', + 'fa': 'fas', + 'ff': 'ful', + 'fi': 'fin', + 'fj': 'fij', + 'fo': 'fao', + 'fr': 'fra', + 'fy': 'fry', + 'ga': 'gle', + 'gd': 'gla', + 'gl': 'glg', + 'gn': 'grn', + 'gu': 'guj', + 'gv': 'glv', + 'ha': 'hau', + 'he': 'heb', + 'hi': 'hin', + 'ho': 'hmo', + 'hr': 'hrv', + 'ht': 'hat', + 'hu': 'hun', + 'hy': 'hye', + 'hz': 'her', + 'ia': 'ina', + 'id': 'ind', + 'ie': 'ile', + 'ig': 'ibo', + 'ii': 'iii', + 'ik': 'ipk', + 'io': 'ido', + 'is': 'isl', + 'it': 'ita', + 'iu': 'iku', + 'ja': 'jpn', + 'jv': 'jav', + 'ka': 'kat', + 'kg': 'kon', + 'ki': 'kik', + 'kj': 'kua', + 'kk': 'kaz', + 'kl': 'kal', + 'km': 'khm', + 'kn': 'kan', + 'ko': 'kor', + 'kr': 'kau', + 'ks': 'kas', + 'ku': 'kur', + 'kv': 'kom', + 'kw': 'cor', + 'ky': 'kir', + 'la': 'lat', + 'lb': 'ltz', + 'lg': 'lug', + 'li': 'lim', + 'ln': 'lin', + 'lo': 'lao', + 'lt': 'lit', + 'lu': 'lub', + 'lv': 'lav', + 'mg': 'mlg', + 'mh': 'mah', + 'mi': 'mri', + 'mk': 'mkd', + 'ml': 'mal', + 'mn': 'mon', + 'mr': 'mar', + 'ms': 'msa', + 'mt': 'mlt', + 'my': 'mya', + 'na': 'nau', + 'nb': 'nob', + 'nd': 'nde', + 'ne': 'nep', + 'ng': 'ndo', + 'nl': 'nld', + 'nn': 'nno', + 'no': 'nor', + 'nr': 'nbl', + 'nv': 'nav', + 'ny': 'nya', + 'oc': 'oci', + 'oj': 'oji', + 'om': 'orm', + 'or': 'ori', + 'os': 'oss', + 'pa': 'pan', + 'pi': 'pli', + 'pl': 'pol', + 'ps': 'pus', + 'pt': 'por', + 'qu': 'que', + 'rm': 'roh', + 'rn': 'run', + 'ro': 'ron', + 'ru': 'rus', + 'rw': 'kin', + 'sa': 'san', + 'sc': 'srd', + 'sd': 'snd', + 'se': 'sme', + 'sg': 'sag', + 'si': 'sin', + 'sk': 'slk', + 'sl': 'slv', + 'sm': 'smo', + 'sn': 'sna', + 'so': 'som', + 'sq': 'sqi', + 'sr': 'srp', + 'ss': 'ssw', + 'st': 'sot', + 'su': 'sun', + 'sv': 'swe', + 'sw': 'swa', + 'ta': 'tam', + 'te': 'tel', + 'tg': 'tgk', + 'th': 'tha', + 'ti': 'tir', + 'tk': 'tuk', + 'tl': 'tgl', + 'tn': 'tsn', + 'to': 'ton', + 'tr': 'tur', + 'ts': 'tso', + 'tt': 'tat', + 'tw': 'twi', + 'ty': 'tah', + 'ug': 'uig', + 'uk': 'ukr', + 'ur': 'urd', + 'uz': 'uzb', + 've': 'ven', + 'vi': 'vie', + 'vo': 'vol', + 'wa': 'wln', + 'wo': 'wol', + 'xh': 'xho', + 'yi': 'yid', + 'yo': 'yor', + 'za': 'zha', + 'zh': 'zho', + 'zu': 'zul', + } + + @classmethod + def short2long(cls, code): + """Convert language code from ISO 639-1 to ISO 639-2/T""" + return cls._lang_map.get(code[:2]) + + @classmethod + def long2short(cls, code): + """Convert language code from ISO 639-2/T to ISO 639-1""" + for short_name, long_name in cls._lang_map.items(): + if long_name == code: + return short_name + + +class ISO3166Utils(object): + # From http://data.okfn.org/data/core/country-list + _country_map = { + 'AF': 'Afghanistan', + 'AX': 'Åland Islands', + 'AL': 'Albania', + 'DZ': 'Algeria', + 'AS': 'American Samoa', + 'AD': 'Andorra', + 'AO': 'Angola', + 'AI': 'Anguilla', + 'AQ': 'Antarctica', + 'AG': 'Antigua and Barbuda', + 'AR': 'Argentina', + 'AM': 'Armenia', + 'AW': 'Aruba', + 'AU': 'Australia', + 'AT': 'Austria', + 'AZ': 'Azerbaijan', + 'BS': 'Bahamas', + 'BH': 'Bahrain', + 'BD': 'Bangladesh', + 'BB': 'Barbados', + 'BY': 'Belarus', + 'BE': 'Belgium', + 'BZ': 'Belize', + 'BJ': 'Benin', + 'BM': 'Bermuda', + 'BT': 'Bhutan', + 'BO': 'Bolivia, Plurinational State of', + 'BQ': 'Bonaire, Sint Eustatius and Saba', + 'BA': 'Bosnia and Herzegovina', + 'BW': 'Botswana', + 'BV': 'Bouvet Island', + 'BR': 'Brazil', + 'IO': 'British Indian Ocean Territory', + 'BN': 'Brunei Darussalam', + 'BG': 'Bulgaria', + 'BF': 'Burkina Faso', + 'BI': 'Burundi', + 'KH': 'Cambodia', + 'CM': 'Cameroon', + 'CA': 'Canada', + 'CV': 'Cape Verde', + 'KY': 'Cayman Islands', + 'CF': 'Central African Republic', + 'TD': 'Chad', + 'CL': 'Chile', + 'CN': 'China', + 'CX': 'Christmas Island', + 'CC': 'Cocos (Keeling) Islands', + 'CO': 'Colombia', + 'KM': 'Comoros', + 'CG': 'Congo', + 'CD': 'Congo, the Democratic Republic of the', + 'CK': 'Cook Islands', + 'CR': 'Costa Rica', + 'CI': 'Côte d\'Ivoire', + 'HR': 'Croatia', + 'CU': 'Cuba', + 'CW': 'Curaçao', + 'CY': 'Cyprus', + 'CZ': 'Czech Republic', + 'DK': 'Denmark', + 'DJ': 'Djibouti', + 'DM': 'Dominica', + 'DO': 'Dominican Republic', + 'EC': 'Ecuador', + 'EG': 'Egypt', + 'SV': 'El Salvador', + 'GQ': 'Equatorial Guinea', + 'ER': 'Eritrea', + 'EE': 'Estonia', + 'ET': 'Ethiopia', + 'FK': 'Falkland Islands (Malvinas)', + 'FO': 'Faroe Islands', + 'FJ': 'Fiji', + 'FI': 'Finland', + 'FR': 'France', + 'GF': 'French Guiana', + 'PF': 'French Polynesia', + 'TF': 'French Southern Territories', + 'GA': 'Gabon', + 'GM': 'Gambia', + 'GE': 'Georgia', + 'DE': 'Germany', + 'GH': 'Ghana', + 'GI': 'Gibraltar', + 'GR': 'Greece', + 'GL': 'Greenland', + 'GD': 'Grenada', + 'GP': 'Guadeloupe', + 'GU': 'Guam', + 'GT': 'Guatemala', + 'GG': 'Guernsey', + 'GN': 'Guinea', + 'GW': 'Guinea-Bissau', + 'GY': 'Guyana', + 'HT': 'Haiti', + 'HM': 'Heard Island and McDonald Islands', + 'VA': 'Holy See (Vatican City State)', + 'HN': 'Honduras', + 'HK': 'Hong Kong', + 'HU': 'Hungary', + 'IS': 'Iceland', + 'IN': 'India', + 'ID': 'Indonesia', + 'IR': 'Iran, Islamic Republic of', + 'IQ': 'Iraq', + 'IE': 'Ireland', + 'IM': 'Isle of Man', + 'IL': 'Israel', + 'IT': 'Italy', + 'JM': 'Jamaica', + 'JP': 'Japan', + 'JE': 'Jersey', + 'JO': 'Jordan', + 'KZ': 'Kazakhstan', + 'KE': 'Kenya', + 'KI': 'Kiribati', + 'KP': 'Korea, Democratic People\'s Republic of', + 'KR': 'Korea, Republic of', + 'KW': 'Kuwait', + 'KG': 'Kyrgyzstan', + 'LA': 'Lao People\'s Democratic Republic', + 'LV': 'Latvia', + 'LB': 'Lebanon', + 'LS': 'Lesotho', + 'LR': 'Liberia', + 'LY': 'Libya', + 'LI': 'Liechtenstein', + 'LT': 'Lithuania', + 'LU': 'Luxembourg', + 'MO': 'Macao', + 'MK': 'Macedonia, the Former Yugoslav Republic of', + 'MG': 'Madagascar', + 'MW': 'Malawi', + 'MY': 'Malaysia', + 'MV': 'Maldives', + 'ML': 'Mali', + 'MT': 'Malta', + 'MH': 'Marshall Islands', + 'MQ': 'Martinique', + 'MR': 'Mauritania', + 'MU': 'Mauritius', + 'YT': 'Mayotte', + 'MX': 'Mexico', + 'FM': 'Micronesia, Federated States of', + 'MD': 'Moldova, Republic of', + 'MC': 'Monaco', + 'MN': 'Mongolia', + 'ME': 'Montenegro', + 'MS': 'Montserrat', + 'MA': 'Morocco', + 'MZ': 'Mozambique', + 'MM': 'Myanmar', + 'NA': 'Namibia', + 'NR': 'Nauru', + 'NP': 'Nepal', + 'NL': 'Netherlands', + 'NC': 'New Caledonia', + 'NZ': 'New Zealand', + 'NI': 'Nicaragua', + 'NE': 'Niger', + 'NG': 'Nigeria', + 'NU': 'Niue', + 'NF': 'Norfolk Island', + 'MP': 'Northern Mariana Islands', + 'NO': 'Norway', + 'OM': 'Oman', + 'PK': 'Pakistan', + 'PW': 'Palau', + 'PS': 'Palestine, State of', + 'PA': 'Panama', + 'PG': 'Papua New Guinea', + 'PY': 'Paraguay', + 'PE': 'Peru', + 'PH': 'Philippines', + 'PN': 'Pitcairn', + 'PL': 'Poland', + 'PT': 'Portugal', + 'PR': 'Puerto Rico', + 'QA': 'Qatar', + 'RE': 'Réunion', + 'RO': 'Romania', + 'RU': 'Russian Federation', + 'RW': 'Rwanda', + 'BL': 'Saint Barthélemy', + 'SH': 'Saint Helena, Ascension and Tristan da Cunha', + 'KN': 'Saint Kitts and Nevis', + 'LC': 'Saint Lucia', + 'MF': 'Saint Martin (French part)', + 'PM': 'Saint Pierre and Miquelon', + 'VC': 'Saint Vincent and the Grenadines', + 'WS': 'Samoa', + 'SM': 'San Marino', + 'ST': 'Sao Tome and Principe', + 'SA': 'Saudi Arabia', + 'SN': 'Senegal', + 'RS': 'Serbia', + 'SC': 'Seychelles', + 'SL': 'Sierra Leone', + 'SG': 'Singapore', + 'SX': 'Sint Maarten (Dutch part)', + 'SK': 'Slovakia', + 'SI': 'Slovenia', + 'SB': 'Solomon Islands', + 'SO': 'Somalia', + 'ZA': 'South Africa', + 'GS': 'South Georgia and the South Sandwich Islands', + 'SS': 'South Sudan', + 'ES': 'Spain', + 'LK': 'Sri Lanka', + 'SD': 'Sudan', + 'SR': 'Suriname', + 'SJ': 'Svalbard and Jan Mayen', + 'SZ': 'Swaziland', + 'SE': 'Sweden', + 'CH': 'Switzerland', + 'SY': 'Syrian Arab Republic', + 'TW': 'Taiwan, Province of China', + 'TJ': 'Tajikistan', + 'TZ': 'Tanzania, United Republic of', + 'TH': 'Thailand', + 'TL': 'Timor-Leste', + 'TG': 'Togo', + 'TK': 'Tokelau', + 'TO': 'Tonga', + 'TT': 'Trinidad and Tobago', + 'TN': 'Tunisia', + 'TR': 'Turkey', + 'TM': 'Turkmenistan', + 'TC': 'Turks and Caicos Islands', + 'TV': 'Tuvalu', + 'UG': 'Uganda', + 'UA': 'Ukraine', + 'AE': 'United Arab Emirates', + 'GB': 'United Kingdom', + 'US': 'United States', + 'UM': 'United States Minor Outlying Islands', + 'UY': 'Uruguay', + 'UZ': 'Uzbekistan', + 'VU': 'Vanuatu', + 'VE': 'Venezuela, Bolivarian Republic of', + 'VN': 'Viet Nam', + 'VG': 'Virgin Islands, British', + 'VI': 'Virgin Islands, U.S.', + 'WF': 'Wallis and Futuna', + 'EH': 'Western Sahara', + 'YE': 'Yemen', + 'ZM': 'Zambia', + 'ZW': 'Zimbabwe', + } + + @classmethod + def short2full(cls, code): + """Convert an ISO 3166-2 country code to the corresponding full name""" + return cls._country_map.get(code.upper()) + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers