X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=79381b3803717081b15c79b26cf9bf4173f18ef6;hb=66e289bab466079558b7acf5cea1057ae35c9bfa;hp=942f76d2452c06a261d75e03cebc999fff02874c;hpb=dc2bd20e5599b4bc32d0cf68af402d9b972a309e;p=youtube-dl

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 942f76d24..79381b380 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -139,21 +139,24 @@ def write_json_file(obj, fn):
 
 
 if sys.version_info >= (2, 7):
-    def find_xpath_attr(node, xpath, key, val):
+    def find_xpath_attr(node, xpath, key, val=None):
         """ Find the xpath xpath[@key=val] """
-        assert re.match(r'^[a-zA-Z-]+$', key)
-        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
-        expr = xpath + "[@%s='%s']" % (key, val)
+        assert re.match(r'^[a-zA-Z_-]+$', key)
+        if val:
+            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
+        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
         return node.find(expr)
 else:
-    def find_xpath_attr(node, xpath, key, val):
+    def find_xpath_attr(node, xpath, key, val=None):
         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
         # .//node does not match if a node is a direct child of . !
         if isinstance(xpath, compat_str):
             xpath = xpath.encode('ascii')
 
         for f in node.findall(xpath):
-            if f.attrib.get(key) == val:
+            if key not in f.attrib:
+                continue
+            if val is None or f.attrib.get(key) == val:
                 return f
         return None
 
@@ -173,12 +176,12 @@ def xpath_with_ns(path, ns_map):
     return '/'.join(replaced)
 
 
-def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
+def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
     if sys.version_info < (2, 7):  # Crazy 2.6
         xpath = xpath.encode('ascii')
 
     n = node.find(xpath)
-    if n is None or n.text is None:
+    if n is None:
         if default is not NO_DEFAULT:
             return default
         elif fatal:
@@ -186,9 +189,37 @@ def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
             raise ExtractorError('Could not find XML element %s' % name)
         else:
             return None
+    return n
+
+
+def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
+    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
+    if n is None or n == default:
+        return n
+    if n.text is None:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            name = xpath if name is None else name
+            raise ExtractorError('Could not find XML element\'s text %s' % name)
+        else:
+            return None
     return n.text
 
 
+def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
+    n = find_xpath_attr(node, xpath, key)
+    if n is None:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            name = '%s[@%s]' % (xpath, key) if name is None else name
+            raise ExtractorError('Could not find XML attribute %s' % name)
+        else:
+            return None
+    return n.attrib[key]
+
+
 def get_element_by_id(id, html):
     """Return the content of the tag with the specified ID in the passed HTML document"""
     return get_element_by_attribute("id", id, html)
@@ -576,16 +607,19 @@ class ContentTooShortError(Exception):
     download is too small for what the server announced first, indicating
     the connection was probably interrupted.
""" - # Both in bytes - downloaded = None - expected = None def __init__(self, downloaded, expected): + # Both in bytes self.downloaded = downloaded self.expected = expected def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): + # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting + # expected HTTP responses to meet HTTP/1.0 or later (see also + # https://github.com/rg3/youtube-dl/issues/6727) + if sys.version_info < (3, 0): + kwargs['strict'] = True hc = http_class(*args, **kwargs) source_address = ydl_handler._params.get('source_address') if source_address is not None: @@ -650,6 +684,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + new_req = req_type( + url_escaped, data=req.data, headers=req.headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + new_req.timeout = req.timeout + req = new_req + for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 # The dict keys are capitalized because of this bug by urllib @@ -694,6 +748,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 + if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + if sys.version_info >= (3, 0): + location = location.encode('iso-8859-1').decode('utf-8') + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + resp.headers['Location'] = location_escaped return resp https_request = http_request @@ -1309,10 +1374,10 @@ def parse_duration(s): m = re.match( r'''(?ix)(?:P?T)? (?: - (?P[0-9.]+)\s*(?:mins?|minutes?)\s*| + (?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*| (?P[0-9.]+)\s*(?:hours?)| - \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*| + \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*| (?: (?: (?:(?P[0-9]+)\s*(?:[:d]|days?)\s*)? 
@@ -1886,6 +1951,32 @@ def dfxp2srt(dfxp_data):
     return ''.join(out)
 
 
+def cli_option(params, command_option, param):
+    param = params.get(param)
+    return [command_option, param] if param is not None else []
+
+
+def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
+    param = params.get(param)
+    assert isinstance(param, bool)
+    if separator:
+        return [command_option + separator + (true_value if param else false_value)]
+    return [command_option, true_value if param else false_value]
+
+
+def cli_valueless_option(params, command_option, param, expected_value=True):
+    param = params.get(param)
+    return [command_option] if param == expected_value else []
+
+
+def cli_configuration_args(params, param, default=[]):
+    ex_args = params.get(param)
+    if ex_args is None:
+        return default
+    assert isinstance(ex_args, list)
+    return ex_args
+
+
 class ISO639Utils(object):
     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
     _lang_map = {
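
Usage note (illustrative, not part of the patch): the new cli_option / cli_bool_option / cli_valueless_option / cli_configuration_args helpers build argv fragments for an external command from a params dict; cli_option and cli_valueless_option return an empty list when the key is unset, so results can simply be concatenated, while cli_bool_option expects an actual boolean. A minimal sketch; the params keys and option names below are hypothetical examples:

    # Illustrative sketch -- the params dict and option names are made up for the example.
    from youtube_dl.utils import (
        cli_bool_option, cli_configuration_args, cli_option, cli_valueless_option)

    params = {
        'proxy': 'socks5://127.0.0.1:1080',
        'nocheckcertificate': True,
        'quiet': None,
        'external_downloader_args': ['--max-connection-per-server', '4'],
    }

    cli_option(params, '--proxy', 'proxy')
    # -> ['--proxy', 'socks5://127.0.0.1:1080']

    cli_bool_option(params, '--check-certificate', 'nocheckcertificate', 'false', 'true')
    # -> ['--check-certificate', 'false']   (inverted flag: True means "do not check")

    cli_valueless_option(params, '--quiet', 'quiet')
    # -> []   (emitted only when the value equals expected_value, True by default)

    cli_configuration_args(params, 'external_downloader_args')
    # -> ['--max-connection-per-server', '4']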