X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=a12b0a7de4b00e7879ea4af75d2b4f333dc8f53a;hb=121c09c7be1ac2944f3432122104c1952bfd1f04;hp=bfe88b40bac05eea67d065953707c88549a190b6;hpb=3ba098a6a5cfba528b3931c38790b582494031f8;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bfe88b40b..a12b0a7de 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -41,6 +41,7 @@ from .compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + shlex_quote, ) @@ -55,6 +56,7 @@ std_headers = { 'Accept-Language': 'en-us,en;q=0.5', } + def preferredencoding(): """Get preferred encoding. @@ -129,7 +131,7 @@ if sys.version_info >= (2, 7): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z-]+$', key) assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) - expr = xpath + u"[@%s='%s']" % (key, val) + expr = xpath + "[@%s='%s']" % (key, val) return node.find(expr) else: def find_xpath_attr(node, xpath, key, val): @@ -145,6 +147,8 @@ else: # On python2.6 the xml.etree.ElementTree.Element methods don't support # the namespace parameter + + def xpath_with_ns(path, ns_map): components = [c.split(':') for c in path.split('/')] replaced = [] @@ -162,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False): xpath = xpath.encode('ascii') n = node.find(xpath) - if n is None: + if n is None or n.text is None: if fatal: name = xpath if name is None else name raise ExtractorError('Could not find XML element %s' % name) @@ -201,6 +205,10 @@ def get_element_by_attribute(attribute, value, html): def clean_html(html): """Clean an HTML snippet into a readable string""" + + if html is None: # Convenience for sanitizing descriptions etc. + return html + # Newline vs
html = html.replace('\n', ' ') html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) @@ -236,9 +244,9 @@ def sanitize_open(filename, open_mode): # In case of error, try to remove win32 forbidden chars alt_filename = os.path.join( - re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) - for path_part in os.path.split(filename) - ) + re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) + for path_part in os.path.split(filename) + ) if alt_filename == filename: raise else: @@ -255,6 +263,7 @@ def timeconvert(timestr): timestamp = email.utils.mktime_tz(timetuple) return timestamp + def sanitize_filename(s, restricted=False, is_id=False): """Sanitizes a string so it could be used as part of a filename. If restricted is set, use a stricter subset of allowed characters. @@ -287,6 +296,7 @@ def sanitize_filename(s, restricted=False, is_id=False): result = '_' return result + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -357,7 +367,7 @@ def encodeArgument(s): if not isinstance(s, compat_str): # Legacy code that uses byte strings # Uncomment the following line after fixing all post processors - #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) + # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) s = s.decode('ascii') return encodeFilename(s, True) @@ -371,6 +381,7 @@ def decodeOption(optval): assert isinstance(optval, compat_str) return optval + def formatSeconds(secs): if secs > 3600: return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60) @@ -381,6 +392,17 @@ def formatSeconds(secs): def make_HTTPS_handler(opts_no_check_certificate, **kwargs): + if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 + context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + if opts_no_check_certificate: + context.verify_mode = ssl.CERT_NONE + try: + return compat_urllib_request.HTTPSHandler(context=context, **kwargs) + except TypeError: + # Python 2.7.8 + # (create_default_context present but HTTPSHandler has no context=) + pass + if sys.version_info < (3, 2): import httplib @@ -402,26 +424,18 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs): def https_open(self, req): return self.do_open(HTTPSConnectionV3, req) return HTTPSHandlerV3(**kwargs) - elif hasattr(ssl, 'create_default_context'): # Python >= 3.4 - context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) - context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3 - if opts_no_check_certificate: - context.verify_mode = ssl.CERT_NONE - return compat_urllib_request.HTTPSHandler(context=context, **kwargs) else: # Python < 3.4 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) context.verify_mode = (ssl.CERT_NONE if opts_no_check_certificate else ssl.CERT_REQUIRED) context.set_default_verify_paths() - try: - context.load_default_certs() - except AttributeError: - pass # Python < 3.4 return compat_urllib_request.HTTPSHandler(context=context, **kwargs) + class ExtractorError(Exception): """Error during info extraction.""" + def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): """ tb, if given, is the original traceback (so that it can be printed out). If expected is set, this is a normal error message and most likely not a bug in youtube-dl. @@ -434,7 +448,13 @@ class ExtractorError(Exception): if cause: msg += ' (caused by %r)' % cause if not expected: - msg = msg + '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' + if ytdl_is_updateable(): + update_cmd = 'type youtube-dl -U to update' + else: + update_cmd = 'see https://yt-dl.org/update on how to update' + msg += '; please report this issue on https://yt-dl.org/bug .' + msg += ' Make sure you are using the latest version; %s.' % update_cmd + msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.' super(ExtractorError, self).__init__(msg) self.traceback = tb @@ -448,6 +468,13 @@ class ExtractorError(Exception): return ''.join(traceback.format_tb(self.traceback)) +class UnsupportedError(ExtractorError): + def __init__(self, url): + super(UnsupportedError, self).__init__( + 'Unsupported URL: %s' % url, expected=True) + self.url = url + + class RegexNotFoundError(ExtractorError): """Error when a regex didn't match""" pass @@ -460,6 +487,7 @@ class DownloadError(Exception): configured to continue on errors. They will contain the appropriate error message. """ + def __init__(self, msg, exc_info=None): """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ super(DownloadError, self).__init__(msg) @@ -481,9 +509,11 @@ class PostProcessingError(Exception): This exception may be raised by PostProcessor's .run() method to indicate an error in the postprocessing task. """ + def __init__(self, msg): self.msg = msg + class MaxDownloadsReached(Exception): """ --max-downloads limit has been reached. """ pass @@ -513,6 +543,7 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -625,17 +656,19 @@ def parse_iso8601(date_str, delimiter='T'): return calendar.timegm(dt.timetuple()) -def unified_strdate(date_str): +def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" if date_str is None: return None - upload_date = None - #Replace commas + # Replace commas date_str = date_str.replace(',', ' ') # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str) + format_expressions = [ '%d %B %Y', '%d %b %Y', @@ -646,11 +679,7 @@ def unified_strdate(date_str): '%b %dth %Y %I:%M%p', '%Y-%m-%d', '%Y/%m/%d', - '%d.%m.%Y', - '%d/%m/%Y', - '%d/%m/%y', '%Y/%m/%d %H:%M:%S', - '%d/%m/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', @@ -662,6 +691,20 @@ def unified_strdate(date_str): '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M', ] + if day_first: + format_expressions.extend([ + '%d.%m.%Y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y %H:%M:%S', + ]) + else: + format_expressions.extend([ + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', + ]) for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -673,6 +716,7 @@ def unified_strdate(date_str): upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') return upload_date + def determine_ext(url, default_ext='unknown_video'): if url is None: return default_ext @@ -682,16 +726,20 @@ def determine_ext(url, default_ext='unknown_video'): else: return default_ext + def subtitles_filename(filename, sub_lang, sub_format): return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format + def date_from_str(date_str): """ Return a datetime object from a string in the format YYYYMMDD or (now|today)[+-][0-9](day|week|month|year)(s)?""" today = datetime.date.today() - if date_str == 'now'or date_str == 'today': + if date_str in ('now', 'today'): return today + if date_str == 'yesterday': + return today - datetime.timedelta(days=1) match = re.match('(now|today)(?P[+-])(?P