X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=91e235ff2f6166106c93e4b8de5bbeb18a6d8b7a;hb=5b995f713bf45e9e373029e1411811dfde855088;hp=28941673fa9bec36b24464faa79cc3b2348aca99;hpb=9218a6b4f5292cc3d7761484e6b5dbe1e7a1a998;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 28941673f..91e235ff2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -23,6 +23,7 @@ import operator import os import pipes import platform +import random import re import socket import ssl @@ -38,6 +39,7 @@ from .compat import ( compat_basestring, compat_chr, compat_etree_fromstring, + compat_expanduser, compat_html_entities, compat_html_entities_html5, compat_http_client, @@ -86,6 +88,11 @@ std_headers = { } +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + + NO_DEFAULT = object() ENGLISH_MONTH_NAMES = [ @@ -123,7 +130,13 @@ DATE_FORMATS = ( '%d %B %Y', '%d %b %Y', '%B %d %Y', + '%B %dst %Y', + '%B %dnd %Y', + '%B %dth %Y', '%b %d %Y', + '%b %dst %Y', + '%b %dnd %Y', + '%b %dth %Y', '%b %dst %Y %I:%M', '%b %dnd %Y %I:%M', '%b %dth %Y %I:%M', @@ -132,6 +145,7 @@ DATE_FORMATS = ( '%Y/%m/%d', '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', @@ -325,17 +339,30 @@ def get_element_by_id(id, html): def get_element_by_class(class_name, html): - return get_element_by_attribute( + """Return the content of the first tag with the specified class in the passed HTML document""" + retval = get_elements_by_class(class_name, html) + return retval[0] if retval else None + + +def get_element_by_attribute(attribute, value, html, escape_value=True): + retval = get_elements_by_attribute(attribute, value, html, escape_value) + return retval[0] if retval else None + + +def get_elements_by_class(class_name, html): + """Return the content of all tags with the specified class in the passed HTML document as a list""" + return get_elements_by_attribute( 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), html, escape_value=False) -def get_element_by_attribute(attribute, value, html, escape_value=True): +def get_elements_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" value = re.escape(value) if escape_value else value - m = re.search(r'''(?xs) + retlist = [] + for m in re.finditer(r'''(?xs) <([a-zA-Z0-9:._-]+) (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s+%s=['"]?%s['"]? @@ -343,16 +370,15 @@ def get_element_by_attribute(attribute, value, html, escape_value=True): \s*> (?P.*?) - ''' % (re.escape(attribute), value), html) + ''' % (re.escape(attribute), value), html): + res = m.group('content') - if not m: - return None - res = m.group('content') + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] + retlist.append(unescapeHTML(res)) - return unescapeHTML(res) + return retlist class HTMLAttributeParser(compat_HTMLParser): @@ -448,7 +474,8 @@ def timeconvert(timestr): def sanitize_filename(s, restricted=False, is_id=False): """Sanitizes a string so it could be used as part of a filename. If restricted is set, use a stricter subset of allowed characters. - Set is_id if this is not an arbitrary string, but an ID that should be kept if possible + Set is_id if this is not an arbitrary string, but an ID that should be kept + if possible. """ def replace_insane(char): if restricted and char in ACCENT_CHARS: @@ -496,7 +523,7 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) @@ -513,6 +540,11 @@ def sanitized_Request(url, *args, **kwargs): return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) +def expand_path(s): + """Expand shell variables and ~""" + return os.path.expandvars(compat_expanduser(s)) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -677,7 +709,12 @@ def bug_reports_message(): return msg -class ExtractorError(Exception): +class YoutubeDLError(Exception): + """Base exception for YoutubeDL errors.""" + pass + + +class ExtractorError(YoutubeDLError): """Error during info extraction.""" def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): @@ -718,7 +755,19 @@ class RegexNotFoundError(ExtractorError): pass -class DownloadError(Exception): +class GeoRestrictedError(ExtractorError): + """Geographic restriction Error exception. + + This exception may be thrown when a video is not available from your + geographic location due to geographic restrictions imposed by a website. + """ + def __init__(self, msg, countries=None): + super(GeoRestrictedError, self).__init__(msg, expected=True) + self.msg = msg + self.countries = countries + + +class DownloadError(YoutubeDLError): """Download Error exception. This exception may be thrown by FileDownloader objects if they are not @@ -732,7 +781,7 @@ class DownloadError(Exception): self.exc_info = exc_info -class SameFileError(Exception): +class SameFileError(YoutubeDLError): """Same File exception. This exception will be thrown by FileDownloader objects if they detect @@ -741,7 +790,7 @@ class SameFileError(Exception): pass -class PostProcessingError(Exception): +class PostProcessingError(YoutubeDLError): """Post Processing exception. This exception may be raised by PostProcessor's .run() method to @@ -749,15 +798,16 @@ class PostProcessingError(Exception): """ def __init__(self, msg): + super(PostProcessingError, self).__init__(msg) self.msg = msg -class MaxDownloadsReached(Exception): +class MaxDownloadsReached(YoutubeDLError): """ --max-downloads limit has been reached. """ pass -class UnavailableVideoError(Exception): +class UnavailableVideoError(YoutubeDLError): """Unavailable Format exception. This exception will be thrown when a video is requested @@ -766,7 +816,7 @@ class UnavailableVideoError(Exception): pass -class ContentTooShortError(Exception): +class ContentTooShortError(YoutubeDLError): """Content Too Short exception. This exception may be raised by FileDownloader objects when a file they @@ -775,12 +825,15 @@ class ContentTooShortError(Exception): """ def __init__(self, downloaded, expected): + super(ContentTooShortError, self).__init__( + 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected) + ) # Both in bytes self.downloaded = downloaded self.expected = expected -class XAttrMetadataError(Exception): +class XAttrMetadataError(YoutubeDLError): def __init__(self, code=None, msg='Unknown error'): super(XAttrMetadataError, self).__init__(msg) self.code = code @@ -796,7 +849,7 @@ class XAttrMetadataError(Exception): self.reason = 'NOT_SUPPORTED' -class XAttrUnavailableError(Exception): +class XAttrUnavailableError(YoutubeDLError): pass @@ -1178,7 +1231,7 @@ def date_from_str(date_str): return today if date_str == 'yesterday': return today - datetime.timedelta(days=1) - match = re.match('(now|today)(?P[+-])(?P