2 # -*- coding: utf-8 -*-
28 import xml.etree.ElementTree
32 import urllib.request as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_request
37 import urllib.error as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2 as compat_urllib_error
42 import urllib.parse as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib as compat_urllib_parse
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
52 import urllib.parse as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse as compat_urlparse
57 import http.cookiejar as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib as compat_cookiejar
62 import html.entities as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs as compat_html_entities
67 import html.parser as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser as compat_html_parser
72 import http.client as compat_http_client
73 except ImportError: # Python 2
74 import httplib as compat_http_client
77 from urllib.error import HTTPError as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
82 from urllib.request import urlretrieve as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
88 from subprocess import DEVNULL
89 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
94 from urllib.parse import unquote as compat_urllib_parse_unquote
96 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
99 res = string.split('%')
106 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
113 pct_sequence += item[:2].decode('hex')
116 # This segment was just a single percent-encoded character.
117 # May be part of a sequence of code units, so delay decoding.
118 # (Stored in pct_sequence).
122 # Encountered non-percent-encoded characters. Flush the current
124 string += pct_sequence.decode(encoding, errors) + rest
127 # Flush the final pct_sequence
128 string += pct_sequence.decode(encoding, errors)
133 from urllib.parse import parse_qs as compat_parse_qs
134 except ImportError: # Python 2
135 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
136 # Python 2's version is apparently totally broken
138 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
139 encoding='utf-8', errors='replace'):
140 qs, _coerce_result = qs, unicode
141 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
143 for name_value in pairs:
144 if not name_value and not strict_parsing:
146 nv = name_value.split('=', 1)
149 raise ValueError("bad query field: %r" % (name_value,))
150 # Handle case of a control-name with no equal sign
151 if keep_blank_values:
155 if len(nv[1]) or keep_blank_values:
156 name = nv[0].replace('+', ' ')
157 name = compat_urllib_parse_unquote(
158 name, encoding=encoding, errors=errors)
159 name = _coerce_result(name)
160 value = nv[1].replace('+', ' ')
161 value = compat_urllib_parse_unquote(
162 value, encoding=encoding, errors=errors)
163 value = _coerce_result(value)
164 r.append((name, value))
167 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
168 encoding='utf-8', errors='replace'):
170 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
171 encoding=encoding, errors=errors)
172 for name, value in pairs:
173 if name in parsed_result:
174 parsed_result[name].append(value)
176 parsed_result[name] = [value]
180 compat_str = unicode # Python 2
185 compat_chr = unichr # Python 2
190 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
191 except ImportError: # Python 2.6
192 from xml.parsers.expat import ExpatError as compat_xml_parse_error
195 if type(c) is int: return c
198 # This is not clearly defined otherwise
199 compiled_regex_type = type(re.compile(''))
202 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
203 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
204 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
205 'Accept-Encoding': 'gzip, deflate',
206 'Accept-Language': 'en-us,en;q=0.5',
209 def preferredencoding():
210 """Get preferred encoding.
212 Returns the best encoding scheme for the system, based on
213 locale.getpreferredencoding() and some further tweaks.
216 pref = locale.getpreferredencoding()
223 if sys.version_info < (3,0):
225 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
228 assert type(s) == type(u'')
231 # In Python 2.x, json.dump expects a bytestream.
232 # In Python 3.x, it writes to a character stream
233 if sys.version_info < (3,0):
234 def write_json_file(obj, fn):
235 with open(fn, 'wb') as f:
238 def write_json_file(obj, fn):
239 with open(fn, 'w', encoding='utf-8') as f:
242 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Find the first element matching xpath[@key=val] under node."""
    # Restrict key/val to characters that cannot break out of the
    # single-quoted attribute predicate built below.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    predicate = u"[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
250 def find_xpath_attr(node, xpath, key, val):
251 for f in node.findall(xpath):
252 if f.attrib.get(key) == val:
256 # On python2.6 the xml.etree.ElementTree.Element methods don't support
257 # the namespace parameter
258 def xpath_with_ns(path, ns_map):
259 components = [c.split(':') for c in path.split('/')]
263 replaced.append(c[0])
266 replaced.append('{%s}%s' % (ns_map[ns], tag))
267 return '/'.join(replaced)
269 def htmlentity_transform(matchobj):
270 """Transforms an HTML entity to a character.
272 This function receives a match object and is intended to be used with
273 the re.sub() function.
275 entity = matchobj.group(1)
277 # Known non-numeric HTML entity
278 if entity in compat_html_entities.name2codepoint:
279 return compat_chr(compat_html_entities.name2codepoint[entity])
281 mobj = re.match(u'(?u)#(x?\\d+)', entity)
283 numstr = mobj.group(1)
284 if numstr.startswith(u'x'):
286 numstr = u'0%s' % numstr
289 return compat_chr(int(numstr, base))
291 # Unknown entity in name, return its literal representation
292 return (u'&%s;' % entity)
294 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
295 class BaseHTMLParser(compat_html_parser.HTMLParser):
297 compat_html_parser.HTMLParser.__init__(self)
300 def loads(self, html):
305 class AttrParser(BaseHTMLParser):
306 """Modified HTMLParser that isolates a tag with the specified attribute"""
307 def __init__(self, attribute, value):
308 self.attribute = attribute
313 self.watch_startpos = False
315 BaseHTMLParser.__init__(self)
317 def error(self, message):
318 if self.error_count > 10 or self.started:
319 raise compat_html_parser.HTMLParseError(message, self.getpos())
320 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
321 self.error_count += 1
324 def handle_starttag(self, tag, attrs):
327 self.find_startpos(None)
328 if self.attribute in attrs and attrs[self.attribute] == self.value:
331 self.watch_startpos = True
333 if not tag in self.depth: self.depth[tag] = 0
336 def handle_endtag(self, tag):
338 if tag in self.depth: self.depth[tag] -= 1
339 if self.depth[self.result[0]] == 0:
341 self.result.append(self.getpos())
def find_startpos(self, x):
    """Needed to put the start position of the result (self.result[1])
    after the opening tag with the requested id"""
    if self.watch_startpos:
        self.watch_startpos = False
        # Record the position of the first parser event that follows the
        # watched start tag; this marks where the tag's content begins.
        self.result.append(self.getpos())
# Every parser event kind is routed to find_startpos so that whichever
# event fires first after the start tag records the content position
# (the event-specific argument is ignored).
handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos
352 def get_result(self):
353 if self.result is None:
355 if len(self.result) != 3:
357 lines = self.html.split('\n')
358 lines = lines[self.result[1][0]-1:self.result[2][0]]
359 lines[0] = lines[0][self.result[1][1]:]
361 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
362 lines[-1] = lines[-1][:self.result[2][1]]
363 return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Older HTMLParser releases (< 2.7.3) fail on the literal "</scr'+'ipt>"
# token that some pages embed inside JavaScript; skip over that token
# instead of letting the stock parse_endtag try (and fail) to parse it.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given ID in the HTML document."""
    # An ID lookup is just an attribute lookup with a fixed attribute name.
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
375 def get_element_by_attribute(attribute, value, html):
376 """Return the content of the tag with the specified attribute in the passed HTML document"""
377 parser = AttrParser(attribute, value)
380 except compat_html_parser.HTMLParseError:
382 return parser.get_result()
384 class MetaParser(BaseHTMLParser):
386 Modified HTMLParser that isolates a meta tag with the specified name
389 def __init__(self, name):
390 BaseHTMLParser.__init__(self)
395 def handle_starttag(self, tag, attrs):
399 if attrs.get('name') == self.name:
400 self.result = attrs.get('content')
402 def get_result(self):
405 def get_meta_content(name, html):
407 Return the content attribute from the meta tag with the given name attribute.
409 parser = MetaParser(name)
412 except compat_html_parser.HTMLParseError:
414 return parser.get_result()
417 def clean_html(html):
418 """Clean an HTML snippet into a readable string"""
420 html = html.replace('\n', ' ')
421 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
422 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
424 html = re.sub('<.*?>', '', html)
425 # Replace html entities
426 html = unescapeHTML(html)
430 def sanitize_open(filename, open_mode):
431 """Try to open the given filename, and slightly tweak it if this fails.
433 Attempts to open the given filename. If this fails, it tries to change
434 the filename slightly, step by step, until it's either able to open it
435 or it fails and raises a final exception, like the standard open()
438 It returns the tuple (stream, definitive_file_name).
442 if sys.platform == 'win32':
444 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
445 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
446 stream = open(encodeFilename(filename), open_mode)
447 return (stream, filename)
448 except (IOError, OSError) as err:
449 if err.errno in (errno.EACCES,):
452 # In case of error, try to remove win32 forbidden chars
453 alt_filename = os.path.join(
454 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
455 for path_part in os.path.split(filename)
457 if alt_filename == filename:
460 # An exception here should be caught in the caller
461 stream = open(encodeFilename(filename), open_mode)
462 return (stream, alt_filename)
465 def timeconvert(timestr):
466 """Convert RFC 2822 defined time string into system timestamp"""
468 timetuple = email.utils.parsedate_tz(timestr)
469 if timetuple is not None:
470 timestamp = email.utils.mktime_tz(timetuple)
473 def sanitize_filename(s, restricted=False, is_id=False):
474 """Sanitizes a string so it could be used as part of a filename.
475 If restricted is set, use a stricter subset of allowed characters.
476 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
478 def replace_insane(char):
479 if char == '?' or ord(char) < 32 or ord(char) == 127:
482 return '' if restricted else '\''
484 return '_-' if restricted else ' -'
485 elif char in '\\/|*<>':
487 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
489 if restricted and ord(char) > 127:
493 result = u''.join(map(replace_insane, s))
495 while '__' in result:
496 result = result.replace('__', '_')
497 result = result.strip('_')
498 # Common case of "Foreign band name - English song title"
499 if restricted and result.startswith('-_'):
505 def orderedSet(iterable):
506 """ Remove all duplicates from the input iterable """
517 assert type(s) == compat_str
519 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
523 def encodeFilename(s, for_subprocess=False):
525 @param s The name of the file
528 assert type(s) == compat_str
530 # Python 3 has a Unicode API
531 if sys.version_info >= (3, 0):
534 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
535 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
536 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
537 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
538 if not for_subprocess:
541 # For subprocess calls, encode with locale encoding
542 # Refer to http://stackoverflow.com/a/9951851/35070
543 encoding = preferredencoding()
545 encoding = sys.getfilesystemencoding()
548 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument, tolerating legacy byte strings."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
560 def decodeOption(optval):
563 if isinstance(optval, bytes):
564 optval = optval.decode(preferredencoding())
566 assert isinstance(optval, compat_str)
569 def formatSeconds(secs):
571 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
573 return '%d:%02d' % (secs // 60, secs % 60)
578 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
579 if sys.version_info < (3, 2):
582 class HTTPSConnectionV3(httplib.HTTPSConnection):
583 def __init__(self, *args, **kwargs):
584 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
587 sock = socket.create_connection((self.host, self.port), self.timeout)
588 if getattr(self, '_tunnel_host', False):
592 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
594 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
596 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
597 def https_open(self, req):
598 return self.do_open(HTTPSConnectionV3, req)
599 return HTTPSHandlerV3(**kwargs)
601 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
602 context.verify_mode = (ssl.CERT_NONE
603 if opts_no_check_certificate
604 else ssl.CERT_REQUIRED)
605 context.set_default_verify_paths()
607 context.load_default_certs()
608 except AttributeError:
610 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
612 class ExtractorError(Exception):
613 """Error during info extraction."""
614 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
615 """ tb, if given, is the original traceback (so that it can be printed out).
616 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
619 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
621 if video_id is not None:
622 msg = video_id + ': ' + msg
624 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
625 super(ExtractorError, self).__init__(msg)
628 self.exc_info = sys.exc_info() # preserve original exception
630 self.video_id = video_id
632 def format_traceback(self):
633 if self.traceback is None:
635 return u''.join(traceback.format_tb(self.traceback))
638 class RegexNotFoundError(ExtractorError):
639 """Error when a regex didn't match"""
643 class DownloadError(Exception):
644 """Download Error exception.
646 This exception may be thrown by FileDownloader objects if they are not
647 configured to continue on errors. They will contain the appropriate
650 def __init__(self, msg, exc_info=None):
651 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
652 super(DownloadError, self).__init__(msg)
653 self.exc_info = exc_info
656 class SameFileError(Exception):
657 """Same File exception.
659 This exception will be thrown by FileDownloader objects if they detect
660 multiple files would have to be downloaded to the same file on disk.
665 class PostProcessingError(Exception):
666 """Post Processing exception.
668 This exception may be raised by PostProcessor's .run() method to
669 indicate an error in the postprocessing task.
671 def __init__(self, msg):
674 class MaxDownloadsReached(Exception):
675 """ --max-downloads limit has been reached. """
679 class UnavailableVideoError(Exception):
680 """Unavailable Format exception.
682 This exception will be thrown when a video is requested
683 in a format that is not available for that video.
688 class ContentTooShortError(Exception):
689 """Content Too Short exception.
691 This exception may be raised by FileDownloader objects when a file they
692 download is too small for what the server announced first, indicating
693 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Store the byte counts so callers can inspect/report the mismatch."""
    self.downloaded = downloaded  # bytes actually received
    self.expected = expected  # bytes the server announced
703 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
704 """Handler for HTTP requests and responses.
706 This class, when installed with an OpenerDirector, automatically adds
707 the standard headers to every HTTP request and handles gzipped and
708 deflated responses from web servers. If compression is to be avoided in
709 a particular request, the original request in the program code only has
710 to include the HTTP header "Youtubedl-No-Compression", which will be
711 removed before making the real request.
713 Part of this code was copied from:
715 http://techknack.net/python-urllib2-handlers/
717 Andrew Rowls, the author of that code, agreed to release it to the
724 return zlib.decompress(data, -zlib.MAX_WBITS)
726 return zlib.decompress(data)
729 def addinfourl_wrapper(stream, headers, url, code):
730 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
731 return compat_urllib_request.addinfourl(stream, headers, url, code)
732 ret = compat_urllib_request.addinfourl(stream, headers, url)
736 def http_request(self, req):
737 for h,v in std_headers.items():
741 if 'Youtubedl-no-compression' in req.headers:
742 if 'Accept-encoding' in req.headers:
743 del req.headers['Accept-encoding']
744 del req.headers['Youtubedl-no-compression']
745 if 'Youtubedl-user-agent' in req.headers:
746 if 'User-agent' in req.headers:
747 del req.headers['User-agent']
748 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
749 del req.headers['Youtubedl-user-agent']
752 def http_response(self, req, resp):
755 if resp.headers.get('Content-encoding', '') == 'gzip':
756 content = resp.read()
757 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
759 uncompressed = io.BytesIO(gz.read())
760 except IOError as original_ioerror:
761 # There may be junk add the end of the file
762 # See http://stackoverflow.com/q/4928560/35070 for details
763 for i in range(1, 1024):
765 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
766 uncompressed = io.BytesIO(gz.read())
771 raise original_ioerror
772 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
773 resp.msg = old_resp.msg
775 if resp.headers.get('Content-encoding', '') == 'deflate':
776 gz = io.BytesIO(self.deflate(resp.read()))
777 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
778 resp.msg = old_resp.msg
781 https_request = http_request
782 https_response = http_response
785 def parse_iso8601(date_str, delimiter='T'):
786 """ Return a UNIX timestamp from the given date """
792 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
795 timezone = datetime.timedelta()
797 date_str = date_str[:-len(m.group(0))]
798 if not m.group('sign'):
799 timezone = datetime.timedelta()
801 sign = 1 if m.group('sign') == '+' else -1
802 timezone = datetime.timedelta(
803 hours=sign * int(m.group('hours')),
804 minutes=sign * int(m.group('minutes')))
805 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
806 dt = datetime.datetime.strptime(date_str, date_format) - timezone
807 return calendar.timegm(dt.timetuple())
810 def unified_strdate(date_str):
811 """Return a string with the date in the format YYYYMMDD"""
818 date_str = date_str.replace(',', ' ')
819 # %z (UTC offset) is only supported in python>=3.2
820 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
821 format_expressions = [
826 '%b %dst %Y %I:%M%p',
827 '%b %dnd %Y %I:%M%p',
828 '%b %dth %Y %I:%M%p',
837 '%Y-%m-%dT%H:%M:%SZ',
838 '%Y-%m-%dT%H:%M:%S.%fZ',
839 '%Y-%m-%dT%H:%M:%S.%f0Z',
841 '%Y-%m-%dT%H:%M:%S.%f',
844 for expression in format_expressions:
846 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
849 if upload_date is None:
850 timetuple = email.utils.parsedate_tz(date_str)
852 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
855 def determine_ext(url, default_ext=u'unknown_video'):
858 guess = url.partition(u'?')[0].rpartition(u'.')[2]
859 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
867 def date_from_str(date_str):
869 Return a datetime object from a string in the format YYYYMMDD or
870 (now|today)[+-][0-9](day|week|month|year)(s)?"""
871 today = datetime.date.today()
872 if date_str == 'now'or date_str == 'today':
874 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
875 if match is not None:
876 sign = match.group('sign')
877 time = int(match.group('time'))
880 unit = match.group('unit')
889 delta = datetime.timedelta(**{unit: time})
891 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
893 def hyphenate_date(date_str):
895 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
896 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
897 if match is not None:
898 return '-'.join(match.groups())
902 class DateRange(object):
903 """Represents a time interval between two dates"""
904 def __init__(self, start=None, end=None):
905 """start and end must be strings in the format accepted by date"""
906 if start is not None:
907 self.start = date_from_str(start)
909 self.start = datetime.datetime.min.date()
911 self.end = date_from_str(end)
913 self.end = datetime.datetime.max.date()
914 if self.start > self.end:
915 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
918 """Returns a range that only contains the given day"""
def __contains__(self, date):
    """Check if the date is in the range"""
    if not isinstance(date, datetime.date):
        # Accept the same string formats as date_from_str
        # (e.g. 'YYYYMMDD', 'now', 'today').
        date = date_from_str(date)
    # Both bounds are inclusive.
    return self.start <= date <= self.end
926 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
930 """ Returns the platform name as a compat_str """
931 res = platform.platform()
932 if isinstance(res, bytes):
933 res = res.decode(preferredencoding())
935 assert isinstance(res, compat_str)
939 def _windows_write_string(s, out):
940 """ Returns True if the string was written using special methods,
941 False if it has yet to be written out."""
942 # Adapted from http://stackoverflow.com/a/3259271/35070
945 import ctypes.wintypes
953 fileno = out.fileno()
954 except AttributeError:
955 # If the output stream doesn't have a fileno, it's virtual
957 if fileno not in WIN_OUTPUT_IDS:
960 GetStdHandle = ctypes.WINFUNCTYPE(
961 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
962 ("GetStdHandle", ctypes.windll.kernel32))
963 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
965 WriteConsoleW = ctypes.WINFUNCTYPE(
966 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
967 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
968 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
969 written = ctypes.wintypes.DWORD(0)
971 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
972 FILE_TYPE_CHAR = 0x0002
973 FILE_TYPE_REMOTE = 0x8000
974 GetConsoleMode = ctypes.WINFUNCTYPE(
975 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
976 ctypes.POINTER(ctypes.wintypes.DWORD))(
977 ("GetConsoleMode", ctypes.windll.kernel32))
978 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
980 def not_a_console(handle):
981 if handle == INVALID_HANDLE_VALUE or handle is None:
983 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
984 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
989 def next_nonbmp_pos(s):
991 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
992 except StopIteration:
996 count = min(next_nonbmp_pos(s), 1024)
999 h, s, count if count else 2, ctypes.byref(written), None)
1001 raise OSError('Failed to write string')
1002 if not count: # We just wrote a non-BMP character
1003 assert written.value == 2
1006 assert written.value > 0
1007 s = s[written.value:]
1011 def write_string(s, out=None, encoding=None):
1014 assert type(s) == compat_str
1016 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1017 if _windows_write_string(s, out):
1020 if ('b' in getattr(out, 'mode', '') or
1021 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1022 byt = s.encode(encoding or preferredencoding(), 'ignore')
1024 elif hasattr(out, 'buffer'):
1025 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1026 byt = s.encode(enc, 'ignore')
1027 out.buffer.write(byt)
1033 def bytes_to_intlist(bs):
1036 if isinstance(bs[0], int): # Python 3
1039 return [ord(c) for c in bs]
1042 def intlist_to_bytes(xs):
1045 if isinstance(chr(0), bytes): # Python 2
1046 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the youtube-dl cache directory.

    An explicit 'cachedir' entry in params wins; otherwise fall back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl when the variable
    is unset).
    """
    # Avoid the mutable-default-argument pitfall; None means "no overrides".
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1057 # Cross-platform file locking
1058 if sys.platform == 'win32':
1059 import ctypes.wintypes
1062 class OVERLAPPED(ctypes.Structure):
1064 ('Internal', ctypes.wintypes.LPVOID),
1065 ('InternalHigh', ctypes.wintypes.LPVOID),
1066 ('Offset', ctypes.wintypes.DWORD),
1067 ('OffsetHigh', ctypes.wintypes.DWORD),
1068 ('hEvent', ctypes.wintypes.HANDLE),
1071 kernel32 = ctypes.windll.kernel32
1072 LockFileEx = kernel32.LockFileEx
1073 LockFileEx.argtypes = [
1074 ctypes.wintypes.HANDLE, # hFile
1075 ctypes.wintypes.DWORD, # dwFlags
1076 ctypes.wintypes.DWORD, # dwReserved
1077 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1078 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1079 ctypes.POINTER(OVERLAPPED) # Overlapped
1081 LockFileEx.restype = ctypes.wintypes.BOOL
1082 UnlockFileEx = kernel32.UnlockFileEx
1083 UnlockFileEx.argtypes = [
1084 ctypes.wintypes.HANDLE, # hFile
1085 ctypes.wintypes.DWORD, # dwReserved
1086 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1087 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1088 ctypes.POINTER(OVERLAPPED) # Overlapped
1090 UnlockFileEx.restype = ctypes.wintypes.BOOL
1091 whole_low = 0xffffffff
1092 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Lock the whole of file object f via the win32 LockFileEx API."""
    overlapped = OVERLAPPED()
    # Lock the region starting at offset 0; the length is given by
    # whole_low/whole_high below, i.e. effectively the entire file.
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Keep a pointer to the OVERLAPPED alive on the file object so the
    # matching _unlock_file call can pass the same structure back.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Release the whole-file lock taken by _lock_file (win32)."""
    # _lock_file must have stashed the OVERLAPPED pointer on this object.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1115 def _lock_file(f, exclusive):
1116 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
def _unlock_file(f):
    # Release any lockf-style lock held on f.
    fcntl.lockf(f, fcntl.LOCK_UN)
1122 class locked_file(object):
1123 def __init__(self, filename, mode, encoding=None):
1124 assert mode in ['r', 'a', 'w']
1125 self.f = io.open(filename, mode, encoding=encoding)
1128 def __enter__(self):
1129 exclusive = self.mode != 'r'
1131 _lock_file(self.f, exclusive)
1137 def __exit__(self, etype, value, traceback):
1139 _unlock_file(self.f)
def write(self, *args):
    # Delegate straight to the wrapped file object; locking is handled
    # by __enter__/__exit__, not per write call.
    return self.f.write(*args)
def read(self, *args):
    # Delegate straight to the wrapped file object.
    return self.f.read(*args)
1153 def shell_quote(args):
1155 encoding = sys.getfilesystemencoding()
1156 if encoding is None:
1159 if isinstance(a, bytes):
1160 # We may get a filename encoded with 'encodeFilename'
1161 a = a.decode(encoding)
1162 quoted_args.append(pipes.quote(a))
1163 return u' '.join(quoted_args)
1166 def takewhile_inclusive(pred, seq):
1167 """ Like itertools.takewhile, but include the latest evaluated element
1168 (the first element so that Not pred(e)) """
1175 def smuggle_url(url, data):
1176 """ Pass additional data in a URL for internal use. """
1178 sdata = compat_urllib_parse.urlencode(
1179 {u'__youtubedl_smuggle': json.dumps(data)})
1180 return url + u'#' + sdata
1183 def unsmuggle_url(smug_url, default=None):
1184 if not '#__youtubedl_smuggle' in smug_url:
1185 return smug_url, default
1186 url, _, sdata = smug_url.rpartition(u'#')
1187 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1188 data = json.loads(jsond)
1192 def format_bytes(bytes):
1195 if type(bytes) is str:
1196 bytes = float(bytes)
1200 exponent = int(math.log(bytes, 1024.0))
1201 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1202 converted = float(bytes) / float(1024 ** exponent)
1203 return u'%.2f%s' % (converted, suffix)
1206 def get_term_width():
1207 columns = os.environ.get('COLUMNS', None)
1212 sp = subprocess.Popen(
1214 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1215 out, err = sp.communicate()
1216 return int(out.split()[1])
1222 def month_by_name(name):
1223 """ Return the number of a month by (locale-independently) English name """
1226 u'January', u'February', u'March', u'April', u'May', u'June',
1227 u'July', u'August', u'September', u'October', u'November', u'December']
1229 return ENGLISH_NAMES.index(name) + 1
1234 def fix_xml_ampersands(xml_str):
1235 """Replace all the '&' by '&' in XML"""
1237 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1242 def setproctitle(title):
1243 assert isinstance(title, compat_str)
1245 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1248 title_bytes = title.encode('utf-8')
1249 buf = ctypes.create_string_buffer(len(title_bytes))
1250 buf.value = title_bytes
1252 libc.prctl(15, buf, 0, 0, 0)
1253 except AttributeError:
1254 return # Strange libc, just skip this
1257 def remove_start(s, start):
1258 if s.startswith(start):
1259 return s[len(start):]
def url_basename(url):
    """Return the last path segment of url (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip(u'/').split(u'/')
    return segments[-1]
1268 class HEADRequest(compat_urllib_request.Request):
1269 def get_method(self):
1273 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1276 v = getattr(v, get_attr, None)
1279 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert v to a string, mapping None to default."""
    if v is None:
        return default
    return compat_str(v)
1286 def str_to_int(int_str):
1289 int_str = re.sub(r'[,\.]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale/scale; None maps to default."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration like '1:02:03', '1h02m03s' or '45' into seconds (int).

    Hours and minutes are optional; a trailing ':NN' suffix (e.g. frames)
    is matched but ignored.  (The None-check, re.match call and final
    return are elided in this chunk.)
    """
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    res = int(m.group('secs'))
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert *ext* in front of the real extension:
    prepend_extension('foo.mp4', 'temp') -> 'foo.temp.mp4'."""
    base, orig_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, orig_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE(review): mutable default `args=[]` is only safe because it is never
    # mutated here; prefer args=None in a future change.  The try/except
    # around Popen (returning False on OSError) is elided in this chunk.
    subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
class PagedList(object):
    """A list-like view over paginated results, fetched one page at a time."""
    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) -> iterable with the items of page `pagenum`;
        # pagesize is the (maximum) number of items per page.
        self._pagefunc = pagefunc
        self._pagesize = pagesize

        # (body of an elided __len__ definition)
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a plain list (end=None: to the end)."""
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute item ids covered by this page: [firstid, nextfirstid).
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Page entirely before the requested range (the `continue` is elided).
            if start >= nextfirstid:

            page_results = list(self._pagefunc(pagenum))

            # startv/endv: slice bounds *within* this page; the surrounding
            # conditional-expression assignments are elided in this chunk.
                start % self._pagesize
                if firstid <= start < nextfirstid

                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in *s*
    (the wrapping re.sub call is elided in this chunk)."""
    unicode_escape = codecs.getdecoder('unicode_escape')
        # Each match is 10 chars: backslash, 'U', 8 hex digits; the decoder
        # returns a (decoded_string, length) tuple, hence the [0].
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
    # Probe: does struct.pack accept a text (unicode) format string?
    # (the enclosing try:/except is elided in this chunk)
    struct.pack(u'!I', 0)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a text format spec to ascii bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)

    # Probe succeeded: text specs work, use struct directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file: one URL per line, skipping comment lines; the
    file object is closed when done."""
        # (body of an elided inner fixup(url) helper)
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # This literal is the UTF-8 BOM bytes as latin-1 code points; strip
        # it in case a tool prepended a BOM to the first line.
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments
        # (the `return False`/strip logic that follows is elided here).
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        # Falsy fixup results (comments, blanks) are filtered out.
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ascii bytes.

    All arguments are forwarded to compat_urllib_parse.urlencode.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# NOTE(review): this fragment appears to be the body of an elided
# `def parse_xml(s)` — it ends in `return` and references `s`.
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
    # Accept (and discard) DOCTYPE declarations so parsing does not abort.
    def doctype(self, name, pubid, system):
        pass  # Ignore doctypes

parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# Passing a custom parser to ElementTree.XML requires Python >= 2.7.
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # Python 2 on Windows: getpass cannot handle unicode prompts, so
    # encode the prompt to the preferred encoding first.
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
    # (selected by an elided `else:` — Python 3 or non-Windows uses
    # the stock implementation directly)
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper — ``func( ... )`` optionally followed by ``;``
    and trailing whitespace — and return the inner payload unchanged."""
    wrapper = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return wrapper.sub(r'\1', code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # (body of an elided inner helper) — maps a quality id to its position
    # in quality_ids, so later entries compare as better quality; the
    # def line, try/except and final return of the helper are elided here.
        return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
    # (inside an elided try:) use the stdlib implementation when available.
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Backport for Python < 2.7, where subprocess.check_output is missing.
    def subprocess_check_output(*args, **kwargs):
        # `input` is not supported by this backport.
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        # Non-zero exit status (the `ret = p.wait()` check is elided here)
        # raises, mirroring subprocess.check_output's contract.
        raise subprocess.CalledProcessError(ret, p.args, output=output)