#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
import calendar
import codecs
import contextlib
import datetime
import email.utils
import errno
-import getpass
import gzip
import itertools
import io
import xml.etree.ElementTree
import zlib
-try:
- import urllib.request as compat_urllib_request
-except ImportError: # Python 2
- import urllib2 as compat_urllib_request
-
-try:
- import urllib.error as compat_urllib_error
-except ImportError: # Python 2
- import urllib2 as compat_urllib_error
-
-try:
- import urllib.parse as compat_urllib_parse
-except ImportError: # Python 2
- import urllib as compat_urllib_parse
-
-try:
- from urllib.parse import urlparse as compat_urllib_parse_urlparse
-except ImportError: # Python 2
- from urlparse import urlparse as compat_urllib_parse_urlparse
-
-try:
- import urllib.parse as compat_urlparse
-except ImportError: # Python 2
- import urlparse as compat_urlparse
-
-try:
- import http.cookiejar as compat_cookiejar
-except ImportError: # Python 2
- import cookielib as compat_cookiejar
-
-try:
- import html.entities as compat_html_entities
-except ImportError: # Python 2
- import htmlentitydefs as compat_html_entities
-
-try:
- import html.parser as compat_html_parser
-except ImportError: # Python 2
- import HTMLParser as compat_html_parser
-
-try:
- import http.client as compat_http_client
-except ImportError: # Python 2
- import httplib as compat_http_client
-
-try:
- from urllib.error import HTTPError as compat_HTTPError
-except ImportError: # Python 2
- from urllib2 import HTTPError as compat_HTTPError
-
-try:
- from urllib.request import urlretrieve as compat_urlretrieve
-except ImportError: # Python 2
- from urllib import urlretrieve as compat_urlretrieve
-
-
-try:
- from subprocess import DEVNULL
- compat_subprocess_get_DEVNULL = lambda: DEVNULL
-except ImportError:
- compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
-
-try:
- from urllib.parse import unquote as compat_urllib_parse_unquote
-except ImportError:
- def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
- if string == '':
- return string
- res = string.split('%')
- if len(res) == 1:
- return string
- if encoding is None:
- encoding = 'utf-8'
- if errors is None:
- errors = 'replace'
- # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
- pct_sequence = b''
- string = res[0]
- for item in res[1:]:
- try:
- if not item:
- raise ValueError
- pct_sequence += item[:2].decode('hex')
- rest = item[2:]
- if not rest:
- # This segment was just a single percent-encoded character.
- # May be part of a sequence of code units, so delay decoding.
- # (Stored in pct_sequence).
- continue
- except ValueError:
- rest = '%' + item
- # Encountered non-percent-encoded characters. Flush the current
- # pct_sequence.
- string += pct_sequence.decode(encoding, errors) + rest
- pct_sequence = b''
- if pct_sequence:
- # Flush the final pct_sequence
- string += pct_sequence.decode(encoding, errors)
- return string
-
-
-try:
- from urllib.parse import parse_qs as compat_parse_qs
-except ImportError: # Python 2
- # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
- # Python 2's version is apparently totally broken
-
- def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
- encoding='utf-8', errors='replace'):
- qs, _coerce_result = qs, unicode
- pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
- r = []
- for name_value in pairs:
- if not name_value and not strict_parsing:
- continue
- nv = name_value.split('=', 1)
- if len(nv) != 2:
- if strict_parsing:
- raise ValueError("bad query field: %r" % (name_value,))
- # Handle case of a control-name with no equal sign
- if keep_blank_values:
- nv.append('')
- else:
- continue
- if len(nv[1]) or keep_blank_values:
- name = nv[0].replace('+', ' ')
- name = compat_urllib_parse_unquote(
- name, encoding=encoding, errors=errors)
- name = _coerce_result(name)
- value = nv[1].replace('+', ' ')
- value = compat_urllib_parse_unquote(
- value, encoding=encoding, errors=errors)
- value = _coerce_result(value)
- r.append((name, value))
- return r
-
- def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
- encoding='utf-8', errors='replace'):
- parsed_result = {}
- pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
- encoding=encoding, errors=errors)
- for name, value in pairs:
- if name in parsed_result:
- parsed_result[name].append(value)
- else:
- parsed_result[name] = [value]
- return parsed_result
-
-try:
- compat_str = unicode # Python 2
-except NameError:
- compat_str = str
-
-try:
- compat_chr = unichr # Python 2
-except NameError:
- compat_chr = chr
-
-try:
- from xml.etree.ElementTree import ParseError as compat_xml_parse_error
-except ImportError: # Python 2.6
- from xml.parsers.expat import ExpatError as compat_xml_parse_error
-
-try:
- from shlex import quote as shlex_quote
-except ImportError: # Python < 3.3
- def shlex_quote(s):
- return "'" + s.replace("'", "'\"'\"'") + "'"
-
-
-def compat_ord(c):
- if type(c) is int: return c
- else: return ord(c)
-
-
-# Environment variables should be decoded with filesystem encoding
-# otherwise this results in issues like #3854 #2918 #3217
-if sys.version_info >= (3, 0):
- compat_getenv = os.getenv
- compat_expanduser = os.path.expanduser
-else:
- def compat_getenv(key, default=None):
- env = os.getenv(key, default)
- if env:
- env = env.decode(get_filesystem_encoding())
- return env
-
- def compat_expanduser(path):
- """Expand ~ and ~user constructs.
-
- If user or $HOME is unknown, do nothing."""
- if path[:1] != '~':
- return path
- i, n = 1, len(path)
- while i < n and path[i] not in '/\\':
- i += 1
-
- if 'HOME' in os.environ:
- userhome = compat_getenv('HOME')
- elif 'USERPROFILE' in os.environ:
- userhome = compat_getenv('USERPROFILE')
- elif not 'HOMEPATH' in os.environ:
- return path
- else:
- try:
- drive = compat_getenv('HOMEDRIVE')
- except KeyError:
- drive = ''
- userhome = os.path.join(drive, compat_getenv('HOMEPATH'))
-
- if i != 1: # ~user
- userhome = os.path.join(os.path.dirname(userhome), path[1:i])
-
- return userhome + path[i:]
+from .compat import (
+ compat_chr,
+ compat_getenv,
+ compat_html_entities,
+ compat_html_parser,
+ compat_parse_qs,
+ compat_str,
+ compat_urllib_error,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
+ compat_urllib_request,
+ compat_urlparse,
+)
# This is not clearly defined otherwise
return pref
-if sys.version_info < (3,0):
- def compat_print(s):
- print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
-else:
- def compat_print(s):
- assert type(s) == type(u'')
- print(s)
-
def write_json_file(obj, fn):
""" Encode obj as JSON and write it to fn, atomically """
return n.text
-compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class BaseHTMLParser(compat_html_parser.HTMLParser):
- def __init(self):
- compat_html_parser.HTMLParser.__init__(self)
- self.html = None
-
- def loads(self, html):
- self.html = html
- self.feed(html)
- self.close()
-
-class AttrParser(BaseHTMLParser):
- """Modified HTMLParser that isolates a tag with the specified attribute"""
- def __init__(self, attribute, value):
- self.attribute = attribute
- self.value = value
- self.result = None
- self.started = False
- self.depth = {}
- self.watch_startpos = False
- self.error_count = 0
- BaseHTMLParser.__init__(self)
-
- def error(self, message):
- if self.error_count > 10 or self.started:
- raise compat_html_parser.HTMLParseError(message, self.getpos())
- self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
- self.error_count += 1
- self.goahead(1)
-
- def handle_starttag(self, tag, attrs):
- attrs = dict(attrs)
- if self.started:
- self.find_startpos(None)
- if self.attribute in attrs and attrs[self.attribute] == self.value:
- self.result = [tag]
- self.started = True
- self.watch_startpos = True
- if self.started:
- if not tag in self.depth: self.depth[tag] = 0
- self.depth[tag] += 1
-
- def handle_endtag(self, tag):
- if self.started:
- if tag in self.depth: self.depth[tag] -= 1
- if self.depth[self.result[0]] == 0:
- self.started = False
- self.result.append(self.getpos())
-
- def find_startpos(self, x):
- """Needed to put the start position of the result (self.result[1])
- after the opening tag with the requested id"""
- if self.watch_startpos:
- self.watch_startpos = False
- self.result.append(self.getpos())
- handle_entityref = handle_charref = handle_data = handle_comment = \
- handle_decl = handle_pi = unknown_decl = find_startpos
-
- def get_result(self):
- if self.result is None:
- return None
- if len(self.result) != 3:
- return None
- lines = self.html.split('\n')
- lines = lines[self.result[1][0]-1:self.result[2][0]]
- lines[0] = lines[0][self.result[1][1]:]
- if len(lines) == 1:
- lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
- lines[-1] = lines[-1][:self.result[2][1]]
- return '\n'.join(lines).strip()
-# Hack for https://github.com/rg3/youtube-dl/issues/662
-if sys.version_info < (2, 7, 3):
- AttrParser.parse_endtag = (lambda self, i:
- i + len("</scr'+'ipt>")
- if self.rawdata[i:].startswith("</scr'+'ipt>")
- else compat_html_parser.HTMLParser.parse_endtag(self, i))
-
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals ``id``.

    Thin convenience wrapper: the lookup itself is delegated to
    get_element_by_attribute with the attribute name fixed to "id".
    """
    return get_element_by_attribute(attribute="id", value=id, html=html)
+
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    The first tag carrying ``attribute=value`` (value optionally quoted) is
    located with a regular expression; the text between that opening tag and
    the first matching closing tag of the same name is returned after HTML
    unescaping.  Returns None when no such tag is found.
    """
    # attribute and value are escaped so they are matched literally inside
    # the verbose pattern.
    tag_pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(tag_pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that begins with a quote character has its first and last
    # character removed before unescaping.
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
def clean_html(html):
expected = True
if video_id is not None:
msg = video_id + ': ' + msg
+ if cause:
+ msg += u' (caused by %r)' % cause
if not expected:
msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
super(ExtractorError, self).__init__(msg)
return None
m = re.search(
- r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
+ r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
date_str)
if not m:
timezone = datetime.timedelta()
timezone = datetime.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
- date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
+ date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
dt = datetime.datetime.strptime(date_str, date_format) - timezone
return calendar.timegm(dt.timetuple())
'%Y/%m/%d %H:%M:%S',
'%d/%m/%Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
+ '%Y-%m-%d %H:%M:%S.%f',
'%d.%m.%Y %H:%M',
'%d.%m.%Y %H.%M',
'%Y-%m-%dT%H:%M:%SZ',
return exe
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized=u'present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    exe          -- name or path of the program to run
    args         -- command-line arguments passed to it (any sequence;
                    the default list is never mutated)
    version_re   -- regular expression whose first group captures the
                    version; it is searched in the first output line only
    unrecognized -- value returned when the program ran but no version
                    could be found in its first output line

    stderr is merged into stdout because some tools print their version
    banner there.
    """
    try:
        out, _ = subprocess.Popen(
            [exe] + list(args),
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        # The executable could not be started at all.
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    # NOTE: the hyphen is placed first in the default character class so it
    # is matched literally; written as "._-a" it forms a range and versions
    # such as "1.2-git" are cut off at the hyphen.
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
+
+
class PagedList(object):
def __len__(self):
# This is only useful for tests
"""Escape non-ASCII characters as suggested by RFC 3986"""
if sys.version_info < (3, 0) and isinstance(s, unicode):
s = s.encode('utf-8')
- return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+ return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
return tree
-if sys.version_info < (3, 0) and sys.platform == 'win32':
- def compat_getpass(prompt, *args, **kwargs):
- if isinstance(prompt, compat_str):
- prompt = prompt.encode(preferredencoding())
- return getpass.getpass(prompt, *args, **kwargs)
-else:
- compat_getpass = getpass.getpass
-
-
US_RATINGS = {
'G': 0,
'PG': 10,
}
def parse_age_limit(s):
    """Turn an age-rating string into an integer age limit.

    A one- or two-digit number, optionally followed by "+", is returned
    as an int.  Anything else is looked up in US_RATINGS, giving None when
    the rating is unknown.  None is passed through unchanged.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric is None:
        return US_RATINGS.get(s, None)
    return int(numeric.group('age'))
+
+
def strip_jsonp(code):
    """Remove a JSONP wrapper ``callback( ... );`` and return its payload.

    Input that does not look like a single callback invocation is returned
    unchanged.
    """
    jsonp_wrapper = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_wrapper.sub(r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into text accepted by json.loads.

    Three kinds of token are rewritten; everything else (numbers,
    punctuation, whitespace) is copied through unchanged:

    * double-quoted strings are kept verbatim,
    * single-quoted strings are re-quoted with double quotes,
    * bare identifiers (unquoted keys) are wrapped in double quotes, except
      for the literals true, false and null.

    Finally, trailing commas before a closing "]" or "}" are removed, since
    JSON allows them in neither arrays nor objects.
    """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Inside a single-quoted string an escaped single quote no
            # longer needs its backslash and a bare double quote now does;
            # every other escape sequence is passed through untouched.
            v = re.sub(r'\\.|"', lambda e: {
                "\\'": "'",
                '"': '\\"',
            }.get(e.group(0), e.group(0)), v)
        return '"%s"' % v

    # A string literal is anything up to the first unescaped closing quote.
    # Any backslash escape is accepted, so that e.g. a string containing a
    # newline escape is treated as one token instead of having its contents
    # mistaken for identifiers.
    res = re.sub(r'''(?xs)
        "(?:[^"\\]|\\.)*"|
        '(?:[^'\\]|\\.)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
-try:
- subprocess_check_output = subprocess.check_output
-except AttributeError:
- def subprocess_check_output(*args, **kwargs):
- assert 'input' not in kwargs
- p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
- output, _ = p.communicate()
- ret = p.poll()
- if ret:
- raise subprocess.CalledProcessError(ret, p.args, output=output)
- return output
-
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if len(s) <= length:
        return s
    # Leave room for the ellipsis marker within the length budget.
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
+
+
def version_tuple(v):
    """Split a dotted version string into a list of its integer parts.

    Raises ValueError when a component is not a valid integer.
    """
    return list(map(int, v.split('.')))
+
+
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether ``version`` is older than ``limit``.

    Both arguments are dotted version strings compared component-wise via
    version_tuple.  When ``version`` is empty/None, or either string cannot
    be parsed, the answer falls back to ``not assume_new``.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum