Merge branch 'cinemassacre' of github.com:rzhxeo/youtube-dl into rzhxeo-cinemassacre
[youtube-dl] / youtube_dl / utils.py
index e6fa634a7f1823a7e5a0f75c2b2eb79b18e9d2e3..82a1daeb9075a056aa908e0a91bbd83b8897673a 100644 (file)
@@ -1,19 +1,20 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import datetime
+import email.utils
 import errno
 import gzip
 import io
 import json
 import locale
 import os
+import platform
 import re
+import socket
 import sys
 import traceback
 import zlib
-import email.utils
-import socket
-import datetime
 
 try:
     import urllib.request as compat_urllib_request
@@ -65,6 +66,12 @@ try:
 except ImportError:  # Python 2
     from urllib2 import HTTPError as compat_HTTPError
 
+try:
+    from urllib.request import urlretrieve as compat_urlretrieve
+except ImportError:  # Python 2
+    from urllib import urlretrieve as compat_urlretrieve
+
+
 try:
     from subprocess import DEVNULL
     compat_subprocess_get_DEVNULL = lambda: DEVNULL
@@ -168,7 +175,7 @@ def compat_ord(c):
 compiled_regex_type = type(re.compile(''))
 
 std_headers = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Encoding': 'gzip, deflate',
@@ -212,7 +219,7 @@ if sys.version_info >= (2,7):
     def find_xpath_attr(node, xpath, key, val):
         """ Find the xpath xpath[@key=val] """
         assert re.match(r'^[a-zA-Z]+$', key)
-        assert re.match(r'^[a-zA-Z@\s]*$', val)
+        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
         expr = xpath + u"[@%s='%s']" % (key, val)
         return node.find(expr)
 else:
@@ -248,7 +255,17 @@ def htmlentity_transform(matchobj):
     return (u'&%s;' % entity)
 
 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class AttrParser(compat_html_parser.HTMLParser):
+class BaseHTMLParser(compat_html_parser.HTMLParser):
+    def __init(self):
+        compat_html_parser.HTMLParser.__init__(self)
+        self.html = None
+
+    def loads(self, html):
+        self.html = html
+        self.feed(html)
+        self.close()
+
+class AttrParser(BaseHTMLParser):
     """Modified HTMLParser that isolates a tag with the specified attribute"""
     def __init__(self, attribute, value):
         self.attribute = attribute
@@ -256,10 +273,9 @@ class AttrParser(compat_html_parser.HTMLParser):
         self.result = None
         self.started = False
         self.depth = {}
-        self.html = None
         self.watch_startpos = False
         self.error_count = 0
-        compat_html_parser.HTMLParser.__init__(self)
+        BaseHTMLParser.__init__(self)
 
     def error(self, message):
         if self.error_count > 10 or self.started:
@@ -268,11 +284,6 @@ class AttrParser(compat_html_parser.HTMLParser):
         self.error_count += 1
         self.goahead(1)
 
-    def loads(self, html):
-        self.html = html
-        self.feed(html)
-        self.close()
-
     def handle_starttag(self, tag, attrs):
         attrs = dict(attrs)
         if self.started:
@@ -333,6 +344,38 @@ def get_element_by_attribute(attribute, value, html):
         pass
     return parser.get_result()
 
+class MetaParser(BaseHTMLParser):
+    """
+    Modified HTMLParser that isolates a meta tag with the specified name 
+    attribute.
+    """
+    def __init__(self, name):
+        BaseHTMLParser.__init__(self)
+        self.name = name
+        self.content = None
+        self.result = None
+
+    def handle_starttag(self, tag, attrs):
+        if tag != 'meta':
+            return
+        attrs = dict(attrs)
+        if attrs.get('name') == self.name:
+            self.result = attrs.get('content')
+
+    def get_result(self):
+        return self.result
+
+def get_meta_content(name, html):
+    """
+    Return the content attribute from the meta tag with the given name attribute.
+    """
+    parser = MetaParser(name)
+    try:
+        parser.loads(html)
+    except compat_html_parser.HTMLParseError:
+        pass
+    return parser.get_result()
+
 
 def clean_html(html):
     """Clean an HTML snippet into a readable string"""
@@ -628,8 +671,23 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
         old_resp = resp
         # gzip
         if resp.headers.get('Content-encoding', '') == 'gzip':
-            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
-            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
+            content = resp.read()
+            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
+            try:
+                uncompressed = io.BytesIO(gz.read())
+            except IOError as original_ioerror:
+                # There may be junk add the end of the file
+                # See http://stackoverflow.com/q/4928560/35070 for details
+                for i in range(1, 1024):
+                    try:
+                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
+                        uncompressed = io.BytesIO(gz.read())
+                    except IOError:
+                        continue
+                    break
+                else:
+                    raise original_ioerror
+            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
         # deflate
         if resp.headers.get('Content-encoding', '') == 'deflate':
@@ -648,7 +706,17 @@ def unified_strdate(date_str):
     date_str = date_str.replace(',',' ')
     # %z (UTC offset) is only supported in python>=3.2
     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
-    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
+    format_expressions = [
+        '%d %B %Y',
+        '%B %d %Y',
+        '%b %d %Y',
+        '%Y-%m-%d',
+        '%d/%m/%Y',
+        '%Y/%m/%d %H:%M:%S',
+        '%d.%m.%Y %H:%M',
+        '%Y-%m-%dT%H:%M:%SZ',
+        '%Y-%m-%dT%H:%M:%S',
+    ]
     for expression in format_expressions:
         try:
             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -717,3 +785,145 @@ class DateRange(object):
         return self.start <= date <= self.end
     def __str__(self):
         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
+
+
+def platform_name():
+    """ Returns the platform name as a compat_str """
+    res = platform.platform()
+    if isinstance(res, bytes):
+        res = res.decode(preferredencoding())
+
+    assert isinstance(res, compat_str)
+    return res
+
+
+def write_string(s, out=None):
+    if out is None:
+        out = sys.stderr
+    assert type(s) == type(u'')
+
+    if ('b' in getattr(out, 'mode', '') or
+            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
+        s = s.encode(preferredencoding(), 'ignore')
+    out.write(s)
+    out.flush()
+
+
+def bytes_to_intlist(bs):
+    if not bs:
+        return []
+    if isinstance(bs[0], int):  # Python 3
+        return list(bs)
+    else:
+        return [ord(c) for c in bs]
+
+
+def intlist_to_bytes(xs):
+    if not xs:
+        return b''
+    if isinstance(chr(0), bytes):  # Python 2
+        return ''.join([chr(x) for x in xs])
+    else:
+        return bytes(xs)
+
+
+def get_cachedir(params={}):
+    cache_root = os.environ.get('XDG_CACHE_HOME',
+                                os.path.expanduser('~/.cache'))
+    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
+
+
+# Cross-platform file locking
+if sys.platform == 'win32':
+    import ctypes.wintypes
+    import msvcrt
+
+    class OVERLAPPED(ctypes.Structure):
+        _fields_ = [
+            ('Internal', ctypes.wintypes.LPVOID),
+            ('InternalHigh', ctypes.wintypes.LPVOID),
+            ('Offset', ctypes.wintypes.DWORD),
+            ('OffsetHigh', ctypes.wintypes.DWORD),
+            ('hEvent', ctypes.wintypes.HANDLE),
+        ]
+
+    kernel32 = ctypes.windll.kernel32
+    LockFileEx = kernel32.LockFileEx
+    LockFileEx.argtypes = [
+        ctypes.wintypes.HANDLE,     # hFile
+        ctypes.wintypes.DWORD,      # dwFlags
+        ctypes.wintypes.DWORD,      # dwReserved
+        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
+        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
+        ctypes.POINTER(OVERLAPPED)  # Overlapped
+    ]
+    LockFileEx.restype = ctypes.wintypes.BOOL
+    UnlockFileEx = kernel32.UnlockFileEx
+    UnlockFileEx.argtypes = [
+        ctypes.wintypes.HANDLE,     # hFile
+        ctypes.wintypes.DWORD,      # dwReserved
+        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
+        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
+        ctypes.POINTER(OVERLAPPED)  # Overlapped
+    ]
+    UnlockFileEx.restype = ctypes.wintypes.BOOL
+    whole_low = 0xffffffff
+    whole_high = 0x7fffffff
+
+    def _lock_file(f, exclusive):
+        overlapped = OVERLAPPED()
+        overlapped.Offset = 0
+        overlapped.OffsetHigh = 0
+        overlapped.hEvent = 0
+        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
+        handle = msvcrt.get_osfhandle(f.fileno())
+        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
+                          whole_low, whole_high, f._lock_file_overlapped_p):
+            raise OSError('Locking file failed: %r' % ctypes.FormatError())
+
+    def _unlock_file(f):
+        assert f._lock_file_overlapped_p
+        handle = msvcrt.get_osfhandle(f.fileno())
+        if not UnlockFileEx(handle, 0,
+                            whole_low, whole_high, f._lock_file_overlapped_p):
+            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
+
+else:
+    import fcntl
+
+    def _lock_file(f, exclusive):
+        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
+
+    def _unlock_file(f):
+        fcntl.lockf(f, fcntl.LOCK_UN)
+
+
+class locked_file(object):
+    def __init__(self, filename, mode, encoding=None):
+        assert mode in ['r', 'a', 'w']
+        self.f = io.open(filename, mode, encoding=encoding)
+        self.mode = mode
+
+    def __enter__(self):
+        exclusive = self.mode != 'r'
+        try:
+            _lock_file(self.f, exclusive)
+        except IOError:
+            self.f.close()
+            raise
+        return self
+
+    def __exit__(self, etype, value, traceback):
+        try:
+            _unlock_file(self.f)
+        finally:
+            self.f.close()
+
+    def __iter__(self):
+        return iter(self.f)
+
+    def write(self, *args):
+        return self.f.write(*args)
+
+    def read(self, *args):
+        return self.f.read(*args)