2 # -*- coding: utf-8 -*-
28 import xml.etree.ElementTree
32 import urllib.request as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_request
37 import urllib.error as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2 as compat_urllib_error
42 import urllib.parse as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib as compat_urllib_parse
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
52 import urllib.parse as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse as compat_urlparse
57 import http.cookiejar as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib as compat_cookiejar
62 import html.entities as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs as compat_html_entities
67 import html.parser as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser as compat_html_parser
72 import http.client as compat_http_client
73 except ImportError: # Python 2
74 import httplib as compat_http_client
77 from urllib.error import HTTPError as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
82 from urllib.request import urlretrieve as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
88 from subprocess import DEVNULL
89 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
94 from urllib.parse import parse_qs as compat_parse_qs
95 except ImportError: # Python 2
96 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
97 # Python 2's version is apparently totally broken
98 def _unquote(string, encoding='utf-8', errors='replace'):
101 res = string.split('%')
108 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
115 pct_sequence += item[:2].decode('hex')
118 # This segment was just a single percent-encoded character.
119 # May be part of a sequence of code units, so delay decoding.
120 # (Stored in pct_sequence).
124 # Encountered non-percent-encoded characters. Flush the current
126 string += pct_sequence.decode(encoding, errors) + rest
129 # Flush the final pct_sequence
130 string += pct_sequence.decode(encoding, errors)
133 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
134 encoding='utf-8', errors='replace'):
135 qs, _coerce_result = qs, unicode
136 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
138 for name_value in pairs:
139 if not name_value and not strict_parsing:
141 nv = name_value.split('=', 1)
144 raise ValueError("bad query field: %r" % (name_value,))
145 # Handle case of a control-name with no equal sign
146 if keep_blank_values:
150 if len(nv[1]) or keep_blank_values:
151 name = nv[0].replace('+', ' ')
152 name = _unquote(name, encoding=encoding, errors=errors)
153 name = _coerce_result(name)
154 value = nv[1].replace('+', ' ')
155 value = _unquote(value, encoding=encoding, errors=errors)
156 value = _coerce_result(value)
157 r.append((name, value))
160 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
161 encoding='utf-8', errors='replace'):
163 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
164 encoding=encoding, errors=errors)
165 for name, value in pairs:
166 if name in parsed_result:
167 parsed_result[name].append(value)
169 parsed_result[name] = [value]
173 compat_str = unicode # Python 2
178 compat_chr = unichr # Python 2
183 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
184 except ImportError: # Python 2.6
185 from xml.parsers.expat import ExpatError as compat_xml_parse_error
188 if type(c) is int: return c
191 # This is not clearly defined otherwise
192 compiled_regex_type = type(re.compile(''))
195 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
196 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
197 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
198 'Accept-Encoding': 'gzip, deflate',
199 'Accept-Language': 'en-us,en;q=0.5',
202 def preferredencoding():
203 """Get preferred encoding.
205 Returns the best encoding scheme for the system, based on
206 locale.getpreferredencoding() and some further tweaks.
209 pref = locale.getpreferredencoding()
216 if sys.version_info < (3,0):
218 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
221 assert type(s) == type(u'')
224 # In Python 2.x, json.dump expects a bytestream.
225 # In Python 3.x, it writes to a character stream
226 if sys.version_info < (3,0):
227 def write_json_file(obj, fn):
228 with open(fn, 'wb') as f:
231 def write_json_file(obj, fn):
232 with open(fn, 'w', encoding='utf-8') as f:
235 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first element below node matching xpath[@key='val'].

    key and val are spliced verbatim into the XPath predicate, so both are
    asserted against conservative character whitelists before the lookup.
    The result is whatever node.find() yields for the built expression.
    """
    key_is_plain = re.match(r'^[a-zA-Z]+$', key)
    assert key_is_plain
    val_is_plain = re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
    assert val_is_plain
    predicate = u"[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
243 def find_xpath_attr(node, xpath, key, val):
244 for f in node.findall(xpath):
245 if f.attrib.get(key) == val:
249 # On python2.6 the xml.etree.ElementTree.Element methods don't support
250 # the namespace parameter
251 def xpath_with_ns(path, ns_map):
252 components = [c.split(':') for c in path.split('/')]
256 replaced.append(c[0])
259 replaced.append('{%s}%s' % (ns_map[ns], tag))
260 return '/'.join(replaced)
262 def htmlentity_transform(matchobj):
263 """Transforms an HTML entity to a character.
265 This function receives a match object and is intended to be used with
266 the re.sub() function.
268 entity = matchobj.group(1)
270 # Known non-numeric HTML entity
271 if entity in compat_html_entities.name2codepoint:
272 return compat_chr(compat_html_entities.name2codepoint[entity])
274 mobj = re.match(u'(?u)#(x?\\d+)', entity)
276 numstr = mobj.group(1)
277 if numstr.startswith(u'x'):
279 numstr = u'0%s' % numstr
282 return compat_chr(int(numstr, base))
284 # Unknown entity in name, return its literal representation
285 return (u'&%s;' % entity)
287 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
288 class BaseHTMLParser(compat_html_parser.HTMLParser):
290 compat_html_parser.HTMLParser.__init__(self)
293 def loads(self, html):
298 class AttrParser(BaseHTMLParser):
299 """Modified HTMLParser that isolates a tag with the specified attribute"""
300 def __init__(self, attribute, value):
301 self.attribute = attribute
306 self.watch_startpos = False
308 BaseHTMLParser.__init__(self)
310 def error(self, message):
311 if self.error_count > 10 or self.started:
312 raise compat_html_parser.HTMLParseError(message, self.getpos())
313 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
314 self.error_count += 1
317 def handle_starttag(self, tag, attrs):
320 self.find_startpos(None)
321 if self.attribute in attrs and attrs[self.attribute] == self.value:
324 self.watch_startpos = True
326 if not tag in self.depth: self.depth[tag] = 0
329 def handle_endtag(self, tag):
331 if tag in self.depth: self.depth[tag] -= 1
332 if self.depth[self.result[0]] == 0:
334 self.result.append(self.getpos())
def find_startpos(self, x):
    """Record where the content of the matched element starts.

    While self.watch_startpos is set, the next call stores the parser's
    current position in self.result and clears the flag, so the stored
    position (self.result[1]) lies after the opening tag with the
    requested attribute. The argument x is ignored.
    """
    if not self.watch_startpos:
        return
    self.watch_startpos = False
    self.result.append(self.getpos())
342 handle_entityref = handle_charref = handle_data = handle_comment = \
343 handle_decl = handle_pi = unknown_decl = find_startpos
345 def get_result(self):
346 if self.result is None:
348 if len(self.result) != 3:
350 lines = self.html.split('\n')
351 lines = lines[self.result[1][0]-1:self.result[2][0]]
352 lines[0] = lines[0][self.result[1][1]:]
354 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
355 lines[-1] = lines[-1][:self.result[2][1]]
356 return '\n'.join(lines).strip()
357 # Hack for https://github.com/rg3/youtube-dl/issues/662
358 if sys.version_info < (2, 7, 3):
359 AttrParser.parse_endtag = (lambda self, i:
360 i + len("</scr'+'ipt>")
361 if self.rawdata[i:].startswith("</scr'+'ipt>")
362 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the passed HTML document"""
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
368 def get_element_by_attribute(attribute, value, html):
369 """Return the content of the tag with the specified attribute in the passed HTML document"""
370 parser = AttrParser(attribute, value)
373 except compat_html_parser.HTMLParseError:
375 return parser.get_result()
377 class MetaParser(BaseHTMLParser):
379 Modified HTMLParser that isolates a meta tag with the specified name
382 def __init__(self, name):
383 BaseHTMLParser.__init__(self)
388 def handle_starttag(self, tag, attrs):
392 if attrs.get('name') == self.name:
393 self.result = attrs.get('content')
395 def get_result(self):
398 def get_meta_content(name, html):
400 Return the content attribute from the meta tag with the given name attribute.
402 parser = MetaParser(name)
405 except compat_html_parser.HTMLParseError:
407 return parser.get_result()
410 def clean_html(html):
411 """Clean an HTML snippet into a readable string"""
413 html = html.replace('\n', ' ')
414 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
415 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
417 html = re.sub('<.*?>', '', html)
418 # Replace html entities
419 html = unescapeHTML(html)
423 def sanitize_open(filename, open_mode):
424 """Try to open the given filename, and slightly tweak it if this fails.
426 Attempts to open the given filename. If this fails, it tries to change
427 the filename slightly, step by step, until it's either able to open it
428 or it fails and raises a final exception, like the standard open()
431 It returns the tuple (stream, definitive_file_name).
435 if sys.platform == 'win32':
437 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
438 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
439 stream = open(encodeFilename(filename), open_mode)
440 return (stream, filename)
441 except (IOError, OSError) as err:
442 if err.errno in (errno.EACCES,):
445 # In case of error, try to remove win32 forbidden chars
446 alt_filename = os.path.join(
447 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
448 for path_part in os.path.split(filename)
450 if alt_filename == filename:
453 # An exception here should be caught in the caller
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, alt_filename)
458 def timeconvert(timestr):
459 """Convert RFC 2822 defined time string into system timestamp"""
461 timetuple = email.utils.parsedate_tz(timestr)
462 if timetuple is not None:
463 timestamp = email.utils.mktime_tz(timetuple)
466 def sanitize_filename(s, restricted=False, is_id=False):
467 """Sanitizes a string so it could be used as part of a filename.
468 If restricted is set, use a stricter subset of allowed characters.
469 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
471 def replace_insane(char):
472 if char == '?' or ord(char) < 32 or ord(char) == 127:
475 return '' if restricted else '\''
477 return '_-' if restricted else ' -'
478 elif char in '\\/|*<>':
480 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
482 if restricted and ord(char) > 127:
486 result = u''.join(map(replace_insane, s))
488 while '__' in result:
489 result = result.replace('__', '_')
490 result = result.strip('_')
491 # Common case of "Foreign band name - English song title"
492 if restricted and result.startswith('-_'):
498 def orderedSet(iterable):
499 """ Remove all duplicates from the input iterable """
510 assert type(s) == compat_str
512 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
516 def encodeFilename(s, for_subprocess=False):
518 @param s The name of the file
521 assert type(s) == compat_str
523 # Python 3 has a Unicode API
524 if sys.version_info >= (3, 0):
527 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
528 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
529 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
530 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
531 if not for_subprocess:
534 # For subprocess calls, encode with locale encoding
535 # Refer to http://stackoverflow.com/a/9951851/35070
536 encoding = preferredencoding()
538 encoding = sys.getfilesystemencoding()
541 return s.encode(encoding, 'ignore')
543 def decodeOption(optval):
546 if isinstance(optval, bytes):
547 optval = optval.decode(preferredencoding())
549 assert isinstance(optval, compat_str)
552 def formatSeconds(secs):
554 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
556 return '%d:%02d' % (secs // 60, secs % 60)
561 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
562 if sys.version_info < (3, 2):
565 class HTTPSConnectionV3(httplib.HTTPSConnection):
566 def __init__(self, *args, **kwargs):
567 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
570 sock = socket.create_connection((self.host, self.port), self.timeout)
571 if getattr(self, '_tunnel_host', False):
575 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
577 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
579 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
580 def https_open(self, req):
581 return self.do_open(HTTPSConnectionV3, req)
582 return HTTPSHandlerV3(**kwargs)
584 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
585 context.verify_mode = (ssl.CERT_NONE
586 if opts_no_check_certificate
587 else ssl.CERT_REQUIRED)
588 context.set_default_verify_paths()
590 context.load_default_certs()
591 except AttributeError:
593 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
595 class ExtractorError(Exception):
596 """Error during info extraction."""
597 def __init__(self, msg, tb=None, expected=False, cause=None):
598 """ tb, if given, is the original traceback (so that it can be printed out).
599 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
602 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
605 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
606 super(ExtractorError, self).__init__(msg)
609 self.exc_info = sys.exc_info() # preserve original exception
612 def format_traceback(self):
613 if self.traceback is None:
615 return u''.join(traceback.format_tb(self.traceback))
618 class RegexNotFoundError(ExtractorError):
619 """Error when a regex didn't match"""
623 class DownloadError(Exception):
624 """Download Error exception.
626 This exception may be thrown by FileDownloader objects if they are not
627 configured to continue on errors. They will contain the appropriate
630 def __init__(self, msg, exc_info=None):
631 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
632 super(DownloadError, self).__init__(msg)
633 self.exc_info = exc_info
636 class SameFileError(Exception):
637 """Same File exception.
639 This exception will be thrown by FileDownloader objects if they detect
640 multiple files would have to be downloaded to the same file on disk.
645 class PostProcessingError(Exception):
646 """Post Processing exception.
648 This exception may be raised by PostProcessor's .run() method to
649 indicate an error in the postprocessing task.
651 def __init__(self, msg):
654 class MaxDownloadsReached(Exception):
655 """ --max-downloads limit has been reached. """
659 class UnavailableVideoError(Exception):
660 """Unavailable Format exception.
662 This exception will be thrown when a video is requested
663 in a format that is not available for that video.
668 class ContentTooShortError(Exception):
669 """Content Too Short exception.
671 This exception may be raised by FileDownloader objects when a file they
672 download is too small for what the server announced first, indicating
673 the connection was probably interrupted.
679 def __init__(self, downloaded, expected):
680 self.downloaded = downloaded
681 self.expected = expected
683 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
684 """Handler for HTTP requests and responses.
686 This class, when installed with an OpenerDirector, automatically adds
687 the standard headers to every HTTP request and handles gzipped and
688 deflated responses from web servers. If compression is to be avoided in
689 a particular request, the original request in the program code only has
690 to include the HTTP header "Youtubedl-No-Compression", which will be
691 removed before making the real request.
693 Part of this code was copied from:
695 http://techknack.net/python-urllib2-handlers/
697 Andrew Rowls, the author of that code, agreed to release it to the
704 return zlib.decompress(data, -zlib.MAX_WBITS)
706 return zlib.decompress(data)
709 def addinfourl_wrapper(stream, headers, url, code):
710 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
711 return compat_urllib_request.addinfourl(stream, headers, url, code)
712 ret = compat_urllib_request.addinfourl(stream, headers, url)
716 def http_request(self, req):
717 for h,v in std_headers.items():
721 if 'Youtubedl-no-compression' in req.headers:
722 if 'Accept-encoding' in req.headers:
723 del req.headers['Accept-encoding']
724 del req.headers['Youtubedl-no-compression']
725 if 'Youtubedl-user-agent' in req.headers:
726 if 'User-agent' in req.headers:
727 del req.headers['User-agent']
728 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
729 del req.headers['Youtubedl-user-agent']
732 def http_response(self, req, resp):
735 if resp.headers.get('Content-encoding', '') == 'gzip':
736 content = resp.read()
737 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
739 uncompressed = io.BytesIO(gz.read())
740 except IOError as original_ioerror:
741 # There may be junk at the end of the file
742 # See http://stackoverflow.com/q/4928560/35070 for details
743 for i in range(1, 1024):
745 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
746 uncompressed = io.BytesIO(gz.read())
751 raise original_ioerror
752 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
753 resp.msg = old_resp.msg
755 if resp.headers.get('Content-encoding', '') == 'deflate':
756 gz = io.BytesIO(self.deflate(resp.read()))
757 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
758 resp.msg = old_resp.msg
761 https_request = http_request
762 https_response = http_response
765 def parse_iso8601(date_str):
766 """ Return a UNIX timestamp from the given date """
772 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
775 timezone = datetime.timedelta()
777 date_str = date_str[:-len(m.group(0))]
778 if not m.group('sign'):
779 timezone = datetime.timedelta()
781 sign = 1 if m.group('sign') == '+' else -1
782 timezone = datetime.timedelta(
783 hours=sign * int(m.group('hours')),
784 minutes=sign * int(m.group('minutes')))
786 dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
787 return calendar.timegm(dt.timetuple())
790 def unified_strdate(date_str):
791 """Return a string with the date in the format YYYYMMDD"""
798 date_str = date_str.replace(',', ' ')
799 # %z (UTC offset) is only supported in python>=3.2
800 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
801 format_expressions = [
813 '%Y-%m-%dT%H:%M:%SZ',
814 '%Y-%m-%dT%H:%M:%S.%fZ',
815 '%Y-%m-%dT%H:%M:%S.%f0Z',
817 '%Y-%m-%dT%H:%M:%S.%f',
820 for expression in format_expressions:
822 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
825 if upload_date is None:
826 timetuple = email.utils.parsedate_tz(date_str)
828 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
831 def determine_ext(url, default_ext=u'unknown_video'):
832 guess = url.partition(u'?')[0].rpartition(u'.')[2]
833 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name from a media file name.

    Everything after the last '.' of filename is dropped and replaced by
    '<sub_lang>.<sub_format>'; a name without any '.' is kept whole.
    """
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
841 def date_from_str(date_str):
843 Return a datetime object from a string in the format YYYYMMDD or
844 (now|today)[+-][0-9](day|week|month|year)(s)?"""
845 today = datetime.date.today()
846 if date_str == 'now'or date_str == 'today':
848 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
849 if match is not None:
850 sign = match.group('sign')
851 time = int(match.group('time'))
854 unit = match.group('unit')
863 delta = datetime.timedelta(**{unit: time})
865 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
867 def hyphenate_date(date_str):
869 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
870 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
871 if match is not None:
872 return '-'.join(match.groups())
876 class DateRange(object):
877 """Represents a time interval between two dates"""
878 def __init__(self, start=None, end=None):
879 """start and end must be strings in the format accepted by date_from_str"""
880 if start is not None:
881 self.start = date_from_str(start)
883 self.start = datetime.datetime.min.date()
885 self.end = date_from_str(end)
887 self.end = datetime.datetime.max.date()
888 if self.start > self.end:
889 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
892 """Returns a range that only contains the given day"""
894 def __contains__(self, date):
895 """Check if the date is in the range"""
896 if not isinstance(date, datetime.date):
897 date = date_from_str(date)
898 return self.start <= date <= self.end
900 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
904 """ Returns the platform name as a compat_str """
905 res = platform.platform()
906 if isinstance(res, bytes):
907 res = res.decode(preferredencoding())
909 assert isinstance(res, compat_str)
913 def write_string(s, out=None, encoding=None):
916 assert type(s) == compat_str
918 if ('b' in getattr(out, 'mode', '') or
919 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
920 byt = s.encode(encoding or preferredencoding(), 'ignore')
922 elif hasattr(out, 'buffer'):
923 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
924 byt = s.encode(enc, 'ignore')
925 out.buffer.write(byt)
931 def bytes_to_intlist(bs):
934 if isinstance(bs[0], int): # Python 3
937 return [ord(c) for c in bs]
940 def intlist_to_bytes(xs):
943 if isinstance(chr(0), bytes): # Python 2
944 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the directory youtube-dl should use for its cache.

    params is an optional options mapping; when it has a 'cachedir' key that
    value is returned as-is. Otherwise the result is '<cache root>/youtube-dl',
    where the cache root is $XDG_CACHE_HOME, falling back to '~/.cache'
    (user-expanded) when the variable is unset or empty.
    """
    if params is None:
        params = {}
    # The XDG spec treats an empty variable like an unset one; using the
    # empty string verbatim would yield a path relative to the current dir.
    cache_root = (os.environ.get('XDG_CACHE_HOME')
                  or os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
955 # Cross-platform file locking
956 if sys.platform == 'win32':
957 import ctypes.wintypes
960 class OVERLAPPED(ctypes.Structure):
962 ('Internal', ctypes.wintypes.LPVOID),
963 ('InternalHigh', ctypes.wintypes.LPVOID),
964 ('Offset', ctypes.wintypes.DWORD),
965 ('OffsetHigh', ctypes.wintypes.DWORD),
966 ('hEvent', ctypes.wintypes.HANDLE),
969 kernel32 = ctypes.windll.kernel32
970 LockFileEx = kernel32.LockFileEx
971 LockFileEx.argtypes = [
972 ctypes.wintypes.HANDLE, # hFile
973 ctypes.wintypes.DWORD, # dwFlags
974 ctypes.wintypes.DWORD, # dwReserved
975 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
976 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
977 ctypes.POINTER(OVERLAPPED) # Overlapped
979 LockFileEx.restype = ctypes.wintypes.BOOL
980 UnlockFileEx = kernel32.UnlockFileEx
981 UnlockFileEx.argtypes = [
982 ctypes.wintypes.HANDLE, # hFile
983 ctypes.wintypes.DWORD, # dwReserved
984 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
985 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
986 ctypes.POINTER(OVERLAPPED) # Overlapped
988 UnlockFileEx.restype = ctypes.wintypes.BOOL
989 whole_low = 0xffffffff
990 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Lock the open file f through LockFileEx (Windows variant).

    A zeroed OVERLAPPED structure is created and a pointer to it is stored
    on f as f._lock_file_overlapped_p, where _unlock_file picks it up again.
    The flag value 0x2 is passed when exclusive is true, 0x0 otherwise.
    Raises OSError when LockFileEx reports failure.
    """
    ov = OVERLAPPED()
    ov.Offset = 0
    ov.OffsetHigh = 0
    ov.hEvent = 0
    ov_ptr = ctypes.pointer(ov)
    f._lock_file_overlapped_p = ov_ptr
    os_handle = msvcrt.get_osfhandle(f.fileno())
    flags = 0x2 if exclusive else 0x0
    locked = LockFileEx(os_handle, flags, 0, whole_low, whole_high, ov_ptr)
    if not locked:
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Undo _lock_file on f through UnlockFileEx (Windows variant).

    Uses the OVERLAPPED pointer that _lock_file stored on the file object.
    Raises OSError when UnlockFileEx reports failure.
    """
    ov_ptr = f._lock_file_overlapped_p
    assert ov_ptr
    os_handle = msvcrt.get_osfhandle(f.fileno())
    released = UnlockFileEx(os_handle, 0, whole_low, whole_high, ov_ptr)
    if not released:
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Lock f with fcntl.lockf: LOCK_EX when exclusive is true, else LOCK_SH."""
    if exclusive:
        operation = fcntl.LOCK_EX
    else:
        operation = fcntl.LOCK_SH
    fcntl.lockf(f, operation)
def _unlock_file(f):
    """Release the fcntl.lockf lock held on f."""
    operation = fcntl.LOCK_UN
    fcntl.lockf(f, operation)
1020 class locked_file(object):
1021 def __init__(self, filename, mode, encoding=None):
1022 assert mode in ['r', 'a', 'w']
1023 self.f = io.open(filename, mode, encoding=encoding)
1026 def __enter__(self):
1027 exclusive = self.mode != 'r'
1029 _lock_file(self.f, exclusive)
1035 def __exit__(self, etype, value, traceback):
1037 _unlock_file(self.f)
1044 def write(self, *args):
1045 return self.f.write(*args)
1047 def read(self, *args):
1048 return self.f.read(*args)
1051 def shell_quote(args):
1053 encoding = sys.getfilesystemencoding()
1054 if encoding is None:
1057 if isinstance(a, bytes):
1058 # We may get a filename encoded with 'encodeFilename'
1059 a = a.decode(encoding)
1060 quoted_args.append(pipes.quote(a))
1061 return u' '.join(quoted_args)
1064 def takewhile_inclusive(pred, seq):
1065 """ Like itertools.takewhile, but include the latest evaluated element
1066 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Attach data to url for internal use.

    data is JSON-serialised, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
1081 def unsmuggle_url(smug_url, default=None):
1082 if not '#__youtubedl_smuggle' in smug_url:
1083 return smug_url, default
1084 url, _, sdata = smug_url.rpartition(u'#')
1085 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1086 data = json.loads(jsond)
1090 def format_bytes(bytes):
1093 if type(bytes) is str:
1094 bytes = float(bytes)
1098 exponent = int(math.log(bytes, 1024.0))
1099 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1100 converted = float(bytes) / float(1024 ** exponent)
1101 return u'%.2f%s' % (converted, suffix)
1104 def str_to_int(int_str):
1105 int_str = re.sub(r'[,\.]', u'', int_str)
1109 def get_term_width():
1110 columns = os.environ.get('COLUMNS', None)
1115 sp = subprocess.Popen(
1117 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1118 out, err = sp.communicate()
1119 return int(out.split()[1])
1125 def month_by_name(name):
1126 """ Return the number of a month by (locale-independently) English name """
1129 u'January', u'February', u'March', u'April', u'May', u'June',
1130 u'July', u'August', u'September', u'October', u'November', u'December']
1132 return ENGLISH_NAMES.index(name) + 1
1137 def fix_xml_ampersands(xml_str):
1138 """Replace all the '&' by '&amp;' in XML"""
1140 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1145 def setproctitle(title):
1146 assert isinstance(title, compat_str)
1148 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1151 title_bytes = title.encode('utf-8')
1152 buf = ctypes.create_string_buffer(len(title_bytes))
1153 buf.value = title_bytes
1155 libc.prctl(15, buf, 0, 0, 0)
1156 except AttributeError:
1157 return # Strange libc, just skip this
1160 def remove_start(s, start):
1161 if s.startswith(start):
1162 return s[len(start):]
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path.

    Leading and trailing slashes of the path are ignored; an empty path
    gives an empty string.
    """
    parsed = compat_urlparse.urlparse(url)
    trimmed = parsed.path.strip(u'/')
    return trimmed.rpartition(u'/')[2]
1171 class HEADRequest(compat_urllib_request.Request):
1172 def get_method(self):
def int_or_none(v, scale=1, default=None):
    """Return int(v) floor-divided by scale, or default when v is None."""
    if v is None:
        return default
    return int(v) // scale
def float_or_none(v, scale=1, default=None):
    """Return float(v) divided by scale, or default when v is None."""
    if v is None:
        return default
    return float(v) / scale
1184 def parse_duration(s):
1189 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
1192 res = int(m.group('secs'))
1194 res += int(m.group('mins')) * 60
1195 if m.group('hours'):
1196 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert ext in front of filename's existing extension.

    The name is split with os.path.splitext and rebuilt as
    '<root>.<ext><original extension>'.
    """
    pieces = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(pieces[0], ext, pieces[1])
1205 def check_executable(exe, args=[]):
1206 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1207 args can be a list of arguments for a short output (like -version) """
1209 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1215 class PagedList(object):
1216 def __init__(self, pagefunc, pagesize):
1217 self._pagefunc = pagefunc
1218 self._pagesize = pagesize
1221 # This is only useful for tests
1222 return len(self.getslice())
1224 def getslice(self, start=0, end=None):
1226 for pagenum in itertools.count(start // self._pagesize):
1227 firstid = pagenum * self._pagesize
1228 nextfirstid = pagenum * self._pagesize + self._pagesize
1229 if start >= nextfirstid:
1232 page_results = list(self._pagefunc(pagenum))
1235 start % self._pagesize
1236 if firstid <= start < nextfirstid
1240 ((end - 1) % self._pagesize) + 1
1241 if (end is not None and firstid <= end <= nextfirstid)
1244 if startv != 0 or endv is not None:
1245 page_results = page_results[startv:endv]
1246 res.extend(page_results)
1248 # A little optimization - if current page is not "full", ie. does
1249 # not contain page_size videos then we can assume that this page
1250 # is the last one - there are no more ids on further pages -
1251 # i.e. no need to query again.
1252 if len(page_results) + startv < self._pagesize:
1255 # If we got the whole page, but the next page is not interesting,
1256 # break out early as well
1257 if end == nextfirstid:
1262 def uppercase_escape(s):
1263 unicode_escape = codecs.getdecoder('unicode_escape')
1265 r'\\U[0-9a-fA-F]{8}',
1266 lambda m: unicode_escape(m.group(0))[0],
1270 struct.pack(u'!I', 0)
1272 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    """struct.pack wrapper that accepts a text format string.

    A compat_str spec is encoded to ASCII bytes first, because struct in
    Python 2.6 (and some 2.7 versions) requires a bytes argument.
    """
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.pack(fmt, *args)
def struct_unpack(spec, *args):
    """struct.unpack wrapper that accepts a text format string.

    A compat_str spec is encoded to ASCII bytes first, because struct in
    Python 2.6 (and some 2.7 versions) requires a bytes argument.
    """
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.unpack(fmt, *args)
1283 struct_pack = struct.pack
1284 struct_unpack = struct.unpack
1287 def read_batch_urls(batch_fd):
1289 if not isinstance(url, compat_str):
1290 url = url.decode('utf-8', 'replace')
1291 BOM_UTF8 = u'\xef\xbb\xbf'
1292 if url.startswith(BOM_UTF8):
1293 url = url[len(BOM_UTF8):]
1295 if url.startswith(('#', ';', ']')):
1299 with contextlib.closing(batch_fd) as fd:
1300 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with urlencode() and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1308 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1309 def doctype(self, name, pubid, system):
1310 pass # Ignore doctypes
1312 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1313 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1314 return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1317 if sys.version_info < (3, 0) and sys.platform == 'win32':
def compat_getpass(prompt, *args, **kwargs):
    """getpass.getpass wrapper that accepts a text prompt.

    A compat_str prompt is encoded with preferredencoding() before being
    handed to getpass.getpass; all other arguments are passed through.
    """
    raw_prompt = (prompt.encode(preferredencoding())
                  if isinstance(prompt, compat_str) else prompt)
    return getpass.getpass(raw_prompt, *args, **kwargs)
1323 compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper and return the bare payload text.

    'callback( <payload> );' becomes '<payload>'. The callback name may
    contain letters, digits, '_', '.' and '$' (e.g. 'jQuery1710_123' or
    'window.cb'), and the trailing ';' is optional. Input that does not
    look like a JSONP call is returned unchanged.
    """
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\)\s*;?\s*$', r'\1', code)