X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=2d12e2df93f6f1e885600e49f0eb6f587494e19f;hb=d6c7a367e88096bb17e323954002c084477fe908;hp=201802cee6e56cbfffeed573c7fc42592a33fdab;hpb=72836fcee453386f4f16325c5b8fa4c1ba1bb442;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 201802cee..2d12e2df9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import ctypes import datetime import email.utils import errno @@ -8,10 +9,14 @@ import gzip import io import json import locale +import math import os +import pipes import platform import re +import ssl import socket +import subprocess import sys import traceback import zlib @@ -66,6 +71,12 @@ try: except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError +try: + from urllib.request import urlretrieve as compat_urlretrieve +except ImportError: # Python 2 + from urllib import urlretrieve as compat_urlretrieve + + try: from subprocess import DEVNULL compat_subprocess_get_DEVNULL = lambda: DEVNULL @@ -169,7 +180,7 @@ def compat_ord(c): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -223,6 +234,19 @@ else: return f return None +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) + else: + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. @@ -249,7 +273,17 @@ def htmlentity_transform(matchobj): return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class AttrParser(compat_html_parser.HTMLParser): +class BaseHTMLParser(compat_html_parser.HTMLParser): + def __init(self): + compat_html_parser.HTMLParser.__init__(self) + self.html = None + + def loads(self, html): + self.html = html + self.feed(html) + self.close() + +class AttrParser(BaseHTMLParser): """Modified HTMLParser that isolates a tag with the specified attribute""" def __init__(self, attribute, value): self.attribute = attribute @@ -257,10 +291,9 @@ class AttrParser(compat_html_parser.HTMLParser): self.result = None self.started = False self.depth = {} - self.html = None self.watch_startpos = False self.error_count = 0 - compat_html_parser.HTMLParser.__init__(self) + BaseHTMLParser.__init__(self) def error(self, message): if self.error_count > 10 or self.started: @@ -269,11 +302,6 @@ class AttrParser(compat_html_parser.HTMLParser): self.error_count += 1 self.goahead(1) - def loads(self, html): - self.html = html - self.feed(html) - self.close() - def handle_starttag(self, tag, attrs): attrs = dict(attrs) if self.started: @@ -334,6 +362,38 @@ def get_element_by_attribute(attribute, value, html): pass return parser.get_result() +class MetaParser(BaseHTMLParser): + """ + Modified HTMLParser that isolates a meta tag with the specified name + attribute. + """ + def __init__(self, name): + BaseHTMLParser.__init__(self) + self.name = name + self.content = None + self.result = None + + def handle_starttag(self, tag, attrs): + if tag != 'meta': + return + attrs = dict(attrs) + if attrs.get('name') == self.name: + self.result = attrs.get('content') + + def get_result(self): + return self.result + +def get_meta_content(name, html): + """ + Return the content attribute from the meta tag with the given name attribute. + """ + parser = MetaParser(name) + try: + parser.loads(html) + except compat_html_parser.HTMLParseError: + pass + return parser.get_result() + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -479,18 +539,38 @@ def formatSeconds(secs): else: return '%d' % secs -def make_HTTPS_handler(opts): - if sys.version_info < (3,2): - # Python's 2.x handler is very simplistic - return compat_urllib_request.HTTPSHandler() +def make_HTTPS_handler(opts_no_check_certificate): + if sys.version_info < (3, 2): + import httplib + + class HTTPSConnectionV3(httplib.HTTPSConnection): + def __init__(self, *args, **kwargs): + httplib.HTTPSConnection.__init__(self, *args, **kwargs) + + def connect(self): + sock = socket.create_connection((self.host, self.port), self.timeout) + if getattr(self, '_tunnel_host', False): + self.sock = sock + self._tunnel() + try: + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3) + except ssl.SSLError: + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) + + class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler): + def https_open(self, req): + return self.do_open(HTTPSConnectionV3, req) + return HTTPSHandlerV3() else: - import ssl - context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) - context.set_default_verify_paths() - + context = ssl.SSLContext(ssl.PROTOCOL_SSLv3) context.verify_mode = (ssl.CERT_NONE - if opts.no_check_certificate + if opts_no_check_certificate else ssl.CERT_REQUIRED) + context.set_default_verify_paths() + try: + context.load_default_certs() + except AttributeError: + pass # Python < 3.4 return compat_urllib_request.HTTPSHandler(context=context) class ExtractorError(Exception): @@ -516,6 +596,11 @@ class ExtractorError(Exception): return u''.join(traceback.format_tb(self.traceback)) +class RegexNotFoundError(ExtractorError): + """Error when a regex didn't match""" + pass + + class DownloadError(Exception): """Download Error exception. @@ -664,7 +749,19 @@ def unified_strdate(date_str): date_str = date_str.replace(',',' ') # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] + format_expressions = [ + '%d %B %Y', + '%B %d %Y', + '%b %d %Y', + '%Y-%m-%d', + '%d/%m/%Y', + '%Y/%m/%d %H:%M:%S', + '%d.%m.%Y %H:%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + ] for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -745,6 +842,18 @@ def platform_name(): return res +def write_string(s, out=None): + if out is None: + out = sys.stderr + assert type(s) == type(u'') + + if ('b' in getattr(out, 'mode', '') or + sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr + s = s.encode(preferredencoding(), 'ignore') + out.write(s) + out.flush() + + def bytes_to_intlist(bs): if not bs: return [] @@ -761,3 +870,224 @@ def intlist_to_bytes(xs): return ''.join([chr(x) for x in xs]) else: return bytes(xs) + + +def get_cachedir(params={}): + cache_root = os.environ.get('XDG_CACHE_HOME', + os.path.expanduser('~/.cache')) + return params.get('cachedir', os.path.join(cache_root, 'youtube-dl')) + + +# Cross-platform file locking +if sys.platform == 'win32': + import ctypes.wintypes + import msvcrt + + class OVERLAPPED(ctypes.Structure): + _fields_ = [ + ('Internal', ctypes.wintypes.LPVOID), + ('InternalHigh', ctypes.wintypes.LPVOID), + ('Offset', ctypes.wintypes.DWORD), + ('OffsetHigh', ctypes.wintypes.DWORD), + ('hEvent', ctypes.wintypes.HANDLE), + ] + + kernel32 = ctypes.windll.kernel32 + LockFileEx = kernel32.LockFileEx + LockFileEx.argtypes = [ + ctypes.wintypes.HANDLE, # hFile + ctypes.wintypes.DWORD, # dwFlags + ctypes.wintypes.DWORD, # dwReserved + ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow + ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh + ctypes.POINTER(OVERLAPPED) # Overlapped + ] + LockFileEx.restype = ctypes.wintypes.BOOL + UnlockFileEx = kernel32.UnlockFileEx + UnlockFileEx.argtypes = [ + ctypes.wintypes.HANDLE, # hFile + ctypes.wintypes.DWORD, # dwReserved + ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow + ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh + ctypes.POINTER(OVERLAPPED) # Overlapped + ] + UnlockFileEx.restype = ctypes.wintypes.BOOL + whole_low = 0xffffffff + whole_high = 0x7fffffff + + def _lock_file(f, exclusive): + overlapped = OVERLAPPED() + overlapped.Offset = 0 + overlapped.OffsetHigh = 0 + overlapped.hEvent = 0 + f._lock_file_overlapped_p = ctypes.pointer(overlapped) + handle = msvcrt.get_osfhandle(f.fileno()) + if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0, + whole_low, whole_high, f._lock_file_overlapped_p): + raise OSError('Locking file failed: %r' % ctypes.FormatError()) + + def _unlock_file(f): + assert f._lock_file_overlapped_p + handle = msvcrt.get_osfhandle(f.fileno()) + if not UnlockFileEx(handle, 0, + whole_low, whole_high, f._lock_file_overlapped_p): + raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) + +else: + import fcntl + + def _lock_file(f, exclusive): + fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + + def _unlock_file(f): + fcntl.lockf(f, fcntl.LOCK_UN) + + +class locked_file(object): + def __init__(self, filename, mode, encoding=None): + assert mode in ['r', 'a', 'w'] + self.f = io.open(filename, mode, encoding=encoding) + self.mode = mode + + def __enter__(self): + exclusive = self.mode != 'r' + try: + _lock_file(self.f, exclusive) + except IOError: + self.f.close() + raise + return self + + def __exit__(self, etype, value, traceback): + try: + _unlock_file(self.f) + finally: + self.f.close() + + def __iter__(self): + return iter(self.f) + + def write(self, *args): + return self.f.write(*args) + + def read(self, *args): + return self.f.read(*args) + + +def shell_quote(args): + quoted_args = [] + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + for a in args: + if isinstance(a, bytes): + # We may get a filename encoded with 'encodeFilename' + a = a.decode(encoding) + quoted_args.append(pipes.quote(a)) + return u' '.join(quoted_args) + + +def takewhile_inclusive(pred, seq): + """ Like itertools.takewhile, but include the latest evaluated element + (the first element so that Not pred(e)) """ + for e in seq: + yield e + if not pred(e): + return + + +def smuggle_url(url, data): + """ Pass additional data in a URL for internal use. """ + + sdata = compat_urllib_parse.urlencode( + {u'__youtubedl_smuggle': json.dumps(data)}) + return url + u'#' + sdata + + +def unsmuggle_url(smug_url): + if not '#__youtubedl_smuggle' in smug_url: + return smug_url, None + url, _, sdata = smug_url.rpartition(u'#') + jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0] + data = json.loads(jsond) + return url, data + + +def format_bytes(bytes): + if bytes is None: + return u'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = int(math.log(bytes, 1024.0)) + suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent] + converted = float(bytes) / float(1024 ** exponent) + return u'%.2f%s' % (converted, suffix) + + +def str_to_int(int_str): + int_str = re.sub(r'[,\.]', u'', int_str) + return int(int_str) + + +def get_term_width(): + columns = os.environ.get('COLUMNS', None) + if columns: + return int(columns) + + try: + sp = subprocess.Popen( + ['stty', 'size'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = sp.communicate() + return int(out.split()[1]) + except: + pass + return None + + +def month_by_name(name): + """ Return the number of a month by (locale-independently) English name """ + + ENGLISH_NAMES = [ + u'January', u'February', u'March', u'April', u'May', u'June', + u'July', u'August', u'September', u'October', u'November', u'December'] + try: + return ENGLISH_NAMES.index(name) + 1 + except ValueError: + return None + + +def fix_xml_all_ampersand(xml_str): + """Replace all the '&' by '&' in XML""" + return xml_str.replace(u'&', u'&') + + +def setproctitle(title): + assert isinstance(title, type(u'')) + try: + libc = ctypes.cdll.LoadLibrary("libc.so.6") + except OSError: + return + title = title + buf = ctypes.create_string_buffer(len(title) + 1) + buf.value = title.encode('utf-8') + try: + libc.prctl(15, ctypes.byref(buf), 0, 0, 0) + except AttributeError: + return # Strange libc, just skip this + + +def remove_start(s, start): + if s.startswith(start): + return s[len(start):] + return s + + +def url_basename(url): + m = re.match(r'(?:https?:|)//[^/]+/(?:[^?#]+/)?([^/?#]+)/?(?:[?#]|$)', url) + if not m: + return u'' + return m.group(1)