import ctypes
import datetime
import email.utils
+import email.header
import errno
import functools
import gzip
import math
import operator
import os
-import pipes
import platform
import random
import re
import zlib
from .compat import (
+ compat_HTMLParseError,
compat_HTMLParser,
compat_basestring,
compat_chr,
retlist = []
for m in re.finditer(r'''(?xs)
<([a-zA-Z0-9:._-]+)
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s+%s=['"]?%s['"]?
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s*>
(?P<content>.*?)
</\1>
but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
"""
parser = HTMLAttributeParser()
- parser.feed(html_element)
- parser.close()
+ try:
+ parser.feed(html_element)
+ parser.close()
+ # Older Python may throw HTMLParseError in case of malformed HTML
+ except compat_HTMLParseError:
+ pass
return parser.attrs
# Newline vs <br />
html = html.replace('\n', ' ')
- html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
- html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
+ html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
assert type(s) == compat_str
return re.sub(
- r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
+ r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
except zlib.error:
return zlib.decompress(data)
- @staticmethod
- def addinfourl_wrapper(stream, headers, url, code):
- if hasattr(compat_urllib_request.addinfourl, 'getcode'):
- return compat_urllib_request.addinfourl(stream, headers, url, code)
- ret = compat_urllib_request.addinfourl(stream, headers, url)
- ret.code = code
- return ret
-
def http_request(self, req):
# According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
# always respected by websites, some tend to give out URLs with non percent-encoded
break
else:
raise original_ioerror
- resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
+ resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
del resp.headers['Content-encoding']
# deflate
if resp.headers.get('Content-encoding', '') == 'deflate':
gz = io.BytesIO(self.deflate(resp.read()))
- resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
+ resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
del resp.headers['Content-encoding']
# Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
if date_str is None:
return None
- date_str = date_str.replace(',', ' ')
+ date_str = re.sub(r'[,|]', '', date_str)
pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
timezone, date_str = extract_timezone(date_str)
# Remove AM/PM + timezone
date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+ # Remove unrecognized timezones from ISO 8601 alike timestamps
+ m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
+ if m:
+ date_str = date_str[:-len(m.group('tz'))]
+
for expression in date_formats(day_first):
try:
dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
if isinstance(a, bytes):
# We may get a filename encoded with 'encodeFilename'
a = a.decode(encoding)
- quoted_args.append(pipes.quote(a))
+ quoted_args.append(compat_shlex_quote(a))
return ' '.join(quoted_args)
return default
+def bool_or_none(v, default=None):
+ return v if isinstance(v, bool) else default
+
+
def strip_or_none(v):
return None if v is None else v.strip()
return new_req
+def _multipart_encode_impl(data, boundary):
+ content_type = 'multipart/form-data; boundary=%s' % boundary
+
+ out = b''
+ for k, v in data.items():
+ out += b'--' + boundary.encode('ascii') + b'\r\n'
+ if isinstance(k, compat_str):
+ k = k.encode('utf-8')
+ if isinstance(v, compat_str):
+ v = v.encode('utf-8')
+ # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
+ # suggests sending UTF-8 directly. Firefox sends UTF-8, too
+ content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
+ if boundary.encode('ascii') in content:
+ raise ValueError('Boundary overlaps with data')
+ out += content
+
+ out += b'--' + boundary.encode('ascii') + b'--\r\n'
+
+ return out, content_type
+
+
+def multipart_encode(data, boundary=None):
+ '''
+ Encode a dict to RFC 7578-compliant form-data
+
+ data:
+ A dict where keys and values can be either Unicode or bytes-like
+ objects.
+ boundary:
+ If specified a Unicode object, it's used as the boundary. Otherwise
+ a random boundary is generated.
+
+ Reference: https://tools.ietf.org/html/rfc7578
+ '''
+ has_specified_boundary = boundary is not None
+
+ while True:
+ if boundary is None:
+ boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
+
+ try:
+ out, content_type = _multipart_encode_impl(data, boundary)
+ break
+ except ValueError:
+ if has_specified_boundary:
+ raise
+ boundary = None
+
+ return out, content_type
+
+
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
if isinstance(key_or_keys, (list, tuple)):
for key in key_or_keys:
def strip_jsonp(code):
return re.sub(
- r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+ r'''(?sx)^
+ (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
+ (?:\s*&&\s*(?P=func_name))?
+ \s*\(\s*(?P<callback_data>.*)\);?
+ \s*?(?://[^\n]*)*$''',
+ r'\g<callback_data>', code)
def js_to_json(code):
return {
'3gpp': '3gp',
'smptett+xml': 'tt',
- 'srt': 'srt',
'ttaf+xml': 'dfxp',
'ttml+xml': 'ttml',
- 'vtt': 'vtt',
'x-flv': 'flv',
'x-mp4-fragmented': 'mp4',
'x-ms-wmv': 'wmv',
'x-mpegurl': 'm3u8',
'vnd.apple.mpegurl': 'm3u8',
'dash+xml': 'mpd',
- 'f4m': 'f4m',
'f4m+xml': 'f4m',
'hds+xml': 'f4m',
'vnd.ms-sstr+xml': 'ism',
'quicktime': 'mov',
+ 'mp2t': 'ts',
}.get(res, res)
if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
if not vcodec:
vcodec = full_codec
- elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
+ elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
if not acodec:
acodec = full_codec
else:
- write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
+ write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
if not vcodec and not acodec:
if len(splited_codecs) == 2:
return {
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
param = params.get(param)
+ if param is None:
+ return []
assert isinstance(param, bool)
if separator:
return [command_option + separator + (true_value if param else false_value)]
"Couldn't find a tool to set the xattrs. "
"Install either the python 'xattr' module, "
"or the 'xattr' binary.")
+
+
+def cookie_to_dict(cookie):
+ cookie_dict = {
+ 'name': cookie.name,
+ 'value': cookie.value,
+ };
+ if cookie.port_specified:
+ cookie_dict['port'] = cookie.port
+ if cookie.domain_specified:
+ cookie_dict['domain'] = cookie.domain
+ if cookie.path_specified:
+ cookie_dict['path'] = cookie.path
+ if not cookie.expires is None:
+ cookie_dict['expires'] = cookie.expires
+ if not cookie.secure is None:
+ cookie_dict['secure'] = cookie.secure
+ if not cookie.discard is None:
+ cookie_dict['discard'] = cookie.discard
+ try:
+ if (cookie.has_nonstandard_attr('httpOnly') or
+ cookie.has_nonstandard_attr('httponly') or
+ cookie.has_nonstandard_attr('HttpOnly')):
+ cookie_dict['httponly'] = True
+ except TypeError:
+ pass
+ return cookie_dict
+
+
+def cookie_jar_to_list(cookie_jar):
+ return [cookie_to_dict(cookie) for cookie in cookie_jar]
+
+
+class PhantomJSwrapper(object):
+ """PhantomJS wrapper class"""
+
+ _TEMPLATE = r'''
+ phantom.onError = function(msg, trace) {{
+ var msgStack = ['PHANTOM ERROR: ' + msg];
+ if(trace && trace.length) {{
+ msgStack.push('TRACE:');
+ trace.forEach(function(t) {{
+ msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
+ + (t.function ? ' (in function ' + t.function +')' : ''));
+ }});
+ }}
+ console.error(msgStack.join('\n'));
+ phantom.exit(1);
+ }};
+ var page = require('webpage').create();
+ var fs = require('fs');
+ var read = {{ mode: 'r', charset: 'utf-8' }};
+ var write = {{ mode: 'w', charset: 'utf-8' }};
+ JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
+ phantom.addCookie(x);
+ }});
+ page.settings.resourceTimeout = {timeout};
+ page.settings.userAgent = "{ua}";
+ page.onLoadStarted = function() {{
+ page.evaluate(function() {{
+ delete window._phantom;
+ delete window.callPhantom;
+ }});
+ }};
+ var saveAndExit = function() {{
+ fs.write("{html}", page.content, write);
+ fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
+ phantom.exit();
+ }};
+ page.onLoadFinished = function(status) {{
+ if(page.url === "") {{
+ page.setContent(fs.read("{html}", read), "{url}");
+ }}
+ else {{
+ {jscode}
+ }}
+ }};
+ page.open("");
+ '''
+
+ _TMP_FILE_NAMES = ['script', 'html', 'cookies']
+
+ @staticmethod
+ def _version():
+ return get_exe_version('phantomjs', version_re=r'([0-9.]+)')
+
+ def __init__(self, extractor, required_version=None, timeout=10000):
+ self.exe = check_executable('phantomjs', ['-v'])
+ if not self.exe:
+ raise ExtractorError('PhantomJS executable not found in PATH, '
+ 'download it from http://phantomjs.org',
+ expected=True)
+
+ self.extractor = extractor
+
+ if required_version:
+ version = self._version()
+ if is_outdated_version(version, required_version):
+ self.extractor._downloader.report_warning(
+ 'Your copy of PhantomJS is outdated, update it to version '
+ '%s or newer if you encounter any errors.' % required_version)
+
+ self.options = {
+ 'timeout': timeout,
+ }
+ self._TMP_FILES = {}
+ for name in self._TMP_FILE_NAMES:
+ tmp = tempfile.NamedTemporaryFile(delete=False)
+ tmp.close()
+ self._TMP_FILES[name] = tmp
+
+ def __del__(self):
+ for name in self._TMP_FILE_NAMES:
+ try:
+ os.remove(self._TMP_FILES[name].name)
+ except:
+ pass
+
+ def _save_cookies(self, url):
+ cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
+ for cookie in cookies:
+ if 'path' not in cookie:
+ cookie['path'] = '/'
+ if 'domain' not in cookie:
+ cookie['domain'] = compat_urlparse.urlparse(url).netloc
+ with open(self._TMP_FILES['cookies'].name, 'wb') as f:
+ f.write(json.dumps(cookies).encode('utf-8'))
+
+ def _load_cookies(self):
+ with open(self._TMP_FILES['cookies'].name, 'rb') as f:
+ cookies = json.loads(f.read().decode('utf-8'))
+ for cookie in cookies:
+ if cookie['httponly'] is True:
+ cookie['rest'] = { 'httpOnly': None }
+ if 'expiry' in cookie:
+ cookie['expire_time'] = cookie['expiry']
+ self.extractor._set_cookie(**cookie)
+
+ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
+ """
+ Downloads webpage (if needed) and executes JS
+
+ Params:
+ url: website url
+ html: optional, html code of website
+ video_id: video id
+ note: optional, displayed when downloading webpage
+ note2: optional, displayed when executing JS
+ headers: custom http headers
+ jscode: code to be executed when page is loaded
+
+ Returns tuple with:
+ * downloaded website (after JS execution)
+ * anything you print with `console.log` (but not inside `page.execute`!)
+
+ In most cases you don't need to add any `jscode`.
+ It is executed in `page.onLoadFinished`.
+ `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
+ It is possible to wait for some element on the webpage, for example:
+ var check = function() {
+ var elementFound = page.evaluate(function() {
+ return document.querySelector('#b.done') !== null;
+ });
+ if(elementFound)
+ saveAndExit();
+ else
+ window.setTimeout(check, 500);
+ }
+
+ page.evaluate(function(){
+ document.querySelector('#a').click();
+ });
+ check();
+ """
+ if 'saveAndExit();' not in jscode:
+ raise ExtractorError('`saveAndExit();` not found in `jscode`')
+ if not html:
+ html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
+ with open(self._TMP_FILES['html'].name, 'wb') as f:
+ f.write(html.encode('utf-8'))
+
+ self._save_cookies(url)
+
+ replaces = self.options
+ replaces['url'] = url
+ user_agent = headers.get('User-Agent') or std_headers['User-Agent']
+ replaces['ua'] = user_agent.replace('"', '\\"')
+ replaces['jscode'] = jscode
+
+ for x in self._TMP_FILE_NAMES:
+ replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"')
+
+ with open(self._TMP_FILES['script'].name, 'wb') as f:
+ f.write(self._TEMPLATE.format(**replaces).encode('utf-8'))
+
+ if video_id is None:
+ self.extractor.to_screen('%s' % (note2,))
+ else:
+ self.extractor.to_screen('%s: %s' % (video_id, note2))
+
+ p = subprocess.Popen([self.exe, '--ssl-protocol=any',
+ self._TMP_FILES['script'].name], stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ out, err = p.communicate()
+ if p.returncode != 0:
+ raise ExtractorError('Executing JS failed\n:'
+ + encodeArgument(err))
+ with open(self._TMP_FILES['html'].name, 'rb') as f:
+ html = f.read().decode('utf-8')
+
+ self._load_cookies()
+
+ return (html, encodeArgument(out))
+
+
+def random_birthday(year_field, month_field, day_field):
+ return {
+ year_field: str(random.randint(1950, 1995)),
+ month_field: str(random.randint(1, 12)),
+ day_field: str(random.randint(1, 31)),
+ }