2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 if type(c) is int: return c
199 # This is not clearly defined otherwise
200 compiled_regex_type = type(re.compile(''))
203 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
204 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
205 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
206 'Accept-Encoding': 'gzip, deflate',
207 'Accept-Language': 'en-us,en;q=0.5',
210 def preferredencoding():
211 """Get preferred encoding.
213 Returns the best encoding scheme for the system, based on
214 locale.getpreferredencoding() and some further tweaks.
217 pref = locale.getpreferredencoding()
224 if sys.version_info < (3,0):
226 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
229 assert type(s) == type(u'')
233 def write_json_file(obj, fn):
234 """ Encode obj as JSON and write it to fn, atomically """
238 'prefix': os.path.basename(fn) + '.',
239 'dir': os.path.dirname(fn),
243 # In Python 2.x, json.dump expects a bytestream.
244 # In Python 3.x, it writes to a character stream
245 if sys.version_info < (3, 0):
253 tf = tempfile.NamedTemporaryFile(**args)
258 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated straight into the XPath expression,
        # so restrict them to characters that cannot change its meaning.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
275 def find_xpath_attr(node, xpath, key, val):
276 for f in node.findall(xpath):
277 if f.attrib.get(key) == val:
281 # On python2.6 the xml.etree.ElementTree.Element methods don't support
282 # the namespace parameter
283 def xpath_with_ns(path, ns_map):
284 components = [c.split(':') for c in path.split('/')]
288 replaced.append(c[0])
291 replaced.append('{%s}%s' % (ns_map[ns], tag))
292 return '/'.join(replaced)
294 def htmlentity_transform(matchobj):
295 """Transforms an HTML entity to a character.
297 This function receives a match object and is intended to be used with
298 the re.sub() function.
300 entity = matchobj.group(1)
302 # Known non-numeric HTML entity
303 if entity in compat_html_entities.name2codepoint:
304 return compat_chr(compat_html_entities.name2codepoint[entity])
306 mobj = re.match(u'(?u)#(x?\\d+)', entity)
308 numstr = mobj.group(1)
309 if numstr.startswith(u'x'):
311 numstr = u'0%s' % numstr
314 return compat_chr(int(numstr, base))
316 # Unknown entity in name, return its literal representation
317 return (u'&%s;' % entity)
319 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
320 class BaseHTMLParser(compat_html_parser.HTMLParser):
322 compat_html_parser.HTMLParser.__init__(self)
325 def loads(self, html):
330 class AttrParser(BaseHTMLParser):
331 """Modified HTMLParser that isolates a tag with the specified attribute"""
332 def __init__(self, attribute, value):
333 self.attribute = attribute
338 self.watch_startpos = False
340 BaseHTMLParser.__init__(self)
342 def error(self, message):
343 if self.error_count > 10 or self.started:
344 raise compat_html_parser.HTMLParseError(message, self.getpos())
345 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
346 self.error_count += 1
349 def handle_starttag(self, tag, attrs):
352 self.find_startpos(None)
353 if self.attribute in attrs and attrs[self.attribute] == self.value:
356 self.watch_startpos = True
358 if not tag in self.depth: self.depth[tag] = 0
361 def handle_endtag(self, tag):
363 if tag in self.depth: self.depth[tag] -= 1
364 if self.depth[self.result[0]] == 0:
366 self.result.append(self.getpos())
368 def find_startpos(self, x):
369 """Needed to put the start position of the result (self.result[1])
370 after the opening tag with the requested id"""
371 if self.watch_startpos:
372 self.watch_startpos = False
373 self.result.append(self.getpos())
374 handle_entityref = handle_charref = handle_data = handle_comment = \
375 handle_decl = handle_pi = unknown_decl = find_startpos
377 def get_result(self):
378 if self.result is None:
380 if len(self.result) != 3:
382 lines = self.html.split('\n')
383 lines = lines[self.result[1][0]-1:self.result[2][0]]
384 lines[0] = lines[0][self.result[1][1]:]
386 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
387 lines[-1] = lines[-1][:self.result[2][1]]
388 return '\n'.join(lines).strip()
389 # Hack for https://github.com/rg3/youtube-dl/issues/662
390 if sys.version_info < (2, 7, 3):
391 AttrParser.parse_endtag = (lambda self, i:
392 i + len("</scr'+'ipt>")
393 if self.rawdata[i:].startswith("</scr'+'ipt>")
394 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just the generic attribute lookup with attribute "id".
    return get_element_by_attribute("id", id, html)
400 def get_element_by_attribute(attribute, value, html):
401 """Return the content of the tag with the specified attribute in the passed HTML document"""
402 parser = AttrParser(attribute, value)
405 except compat_html_parser.HTMLParseError:
407 return parser.get_result()
409 class MetaParser(BaseHTMLParser):
411 Modified HTMLParser that isolates a meta tag with the specified name
414 def __init__(self, name):
415 BaseHTMLParser.__init__(self)
420 def handle_starttag(self, tag, attrs):
424 if attrs.get('name') == self.name:
425 self.result = attrs.get('content')
427 def get_result(self):
430 def get_meta_content(name, html):
432 Return the content attribute from the meta tag with the given name attribute.
434 parser = MetaParser(name)
437 except compat_html_parser.HTMLParseError:
439 return parser.get_result()
442 def clean_html(html):
443 """Clean an HTML snippet into a readable string"""
445 html = html.replace('\n', ' ')
446 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
447 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
449 html = re.sub('<.*?>', '', html)
450 # Replace html entities
451 html = unescapeHTML(html)
455 def sanitize_open(filename, open_mode):
456 """Try to open the given filename, and slightly tweak it if this fails.
458 Attempts to open the given filename. If this fails, it tries to change
459 the filename slightly, step by step, until it's either able to open it
460 or it fails and raises a final exception, like the standard open()
463 It returns the tuple (stream, definitive_file_name).
467 if sys.platform == 'win32':
469 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
470 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
471 stream = open(encodeFilename(filename), open_mode)
472 return (stream, filename)
473 except (IOError, OSError) as err:
474 if err.errno in (errno.EACCES,):
477 # In case of error, try to remove win32 forbidden chars
478 alt_filename = os.path.join(
479 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
480 for path_part in os.path.split(filename)
482 if alt_filename == filename:
485 # An exception here should be caught in the caller
486 stream = open(encodeFilename(filename), open_mode)
487 return (stream, alt_filename)
490 def timeconvert(timestr):
491 """Convert RFC 2822 defined time string into system timestamp"""
493 timetuple = email.utils.parsedate_tz(timestr)
494 if timetuple is not None:
495 timestamp = email.utils.mktime_tz(timetuple)
498 def sanitize_filename(s, restricted=False, is_id=False):
499 """Sanitizes a string so it could be used as part of a filename.
500 If restricted is set, use a stricter subset of allowed characters.
501 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
503 def replace_insane(char):
504 if char == '?' or ord(char) < 32 or ord(char) == 127:
507 return '' if restricted else '\''
509 return '_-' if restricted else ' -'
510 elif char in '\\/|*<>':
512 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
514 if restricted and ord(char) > 127:
518 result = u''.join(map(replace_insane, s))
520 while '__' in result:
521 result = result.replace('__', '_')
522 result = result.strip('_')
523 # Common case of "Foreign band name - English song title"
524 if restricted and result.startswith('-_'):
530 def orderedSet(iterable):
531 """ Remove all duplicates from the input iterable """
542 assert type(s) == compat_str
544 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
548 def encodeFilename(s, for_subprocess=False):
550 @param s The name of the file
553 assert type(s) == compat_str
555 # Python 3 has a Unicode API
556 if sys.version_info >= (3, 0):
559 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
560 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
561 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
562 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
563 if not for_subprocess:
566 # For subprocess calls, encode with locale encoding
567 # Refer to http://stackoverflow.com/a/9951851/35070
568 encoding = preferredencoding()
570 encoding = sys.getfilesystemencoding()
573 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
585 def decodeOption(optval):
588 if isinstance(optval, bytes):
589 optval = optval.decode(preferredencoding())
591 assert isinstance(optval, compat_str)
594 def formatSeconds(secs):
596 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
598 return '%d:%02d' % (secs // 60, secs % 60)
603 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
604 if sys.version_info < (3, 2):
607 class HTTPSConnectionV3(httplib.HTTPSConnection):
608 def __init__(self, *args, **kwargs):
609 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
612 sock = socket.create_connection((self.host, self.port), self.timeout)
613 if getattr(self, '_tunnel_host', False):
617 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
619 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
621 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
622 def https_open(self, req):
623 return self.do_open(HTTPSConnectionV3, req)
624 return HTTPSHandlerV3(**kwargs)
626 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
627 context.verify_mode = (ssl.CERT_NONE
628 if opts_no_check_certificate
629 else ssl.CERT_REQUIRED)
630 context.set_default_verify_paths()
632 context.load_default_certs()
633 except AttributeError:
635 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
637 class ExtractorError(Exception):
638 """Error during info extraction."""
639 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
640 """ tb, if given, is the original traceback (so that it can be printed out).
641 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
644 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
646 if video_id is not None:
647 msg = video_id + ': ' + msg
649 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
650 super(ExtractorError, self).__init__(msg)
653 self.exc_info = sys.exc_info() # preserve original exception
655 self.video_id = video_id
657 def format_traceback(self):
658 if self.traceback is None:
660 return u''.join(traceback.format_tb(self.traceback))
663 class RegexNotFoundError(ExtractorError):
664 """Error when a regex didn't match"""
668 class DownloadError(Exception):
669 """Download Error exception.
671 This exception may be thrown by FileDownloader objects if they are not
672 configured to continue on errors. They will contain the appropriate
675 def __init__(self, msg, exc_info=None):
676 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
677 super(DownloadError, self).__init__(msg)
678 self.exc_info = exc_info
681 class SameFileError(Exception):
682 """Same File exception.
684 This exception will be thrown by FileDownloader objects if they detect
685 multiple files would have to be downloaded to the same file on disk.
690 class PostProcessingError(Exception):
691 """Post Processing exception.
693 This exception may be raised by PostProcessor's .run() method to
694 indicate an error in the postprocessing task.
696 def __init__(self, msg):
699 class MaxDownloadsReached(Exception):
700 """ --max-downloads limit has been reached. """
704 class UnavailableVideoError(Exception):
705 """Unavailable Format exception.
707 This exception will be thrown when a video is requested
708 in a format that is not available for that video.
713 class ContentTooShortError(Exception):
714 """Content Too Short exception.
716 This exception may be raised by FileDownloader objects when a file they
717 download is too small for what the server announced first, indicating
718 the connection was probably interrupted.
724 def __init__(self, downloaded, expected):
725 self.downloaded = downloaded
726 self.expected = expected
728 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
729 """Handler for HTTP requests and responses.
731 This class, when installed with an OpenerDirector, automatically adds
732 the standard headers to every HTTP request and handles gzipped and
733 deflated responses from web servers. If compression is to be avoided in
734 a particular request, the original request in the program code only has
735 to include the HTTP header "Youtubedl-No-Compression", which will be
736 removed before making the real request.
738 Part of this code was copied from:
740 http://techknack.net/python-urllib2-handlers/
742 Andrew Rowls, the author of that code, agreed to release it to the
749 return zlib.decompress(data, -zlib.MAX_WBITS)
751 return zlib.decompress(data)
754 def addinfourl_wrapper(stream, headers, url, code):
755 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
756 return compat_urllib_request.addinfourl(stream, headers, url, code)
757 ret = compat_urllib_request.addinfourl(stream, headers, url)
761 def http_request(self, req):
762 for h,v in std_headers.items():
766 if 'Youtubedl-no-compression' in req.headers:
767 if 'Accept-encoding' in req.headers:
768 del req.headers['Accept-encoding']
769 del req.headers['Youtubedl-no-compression']
770 if 'Youtubedl-user-agent' in req.headers:
771 if 'User-agent' in req.headers:
772 del req.headers['User-agent']
773 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
774 del req.headers['Youtubedl-user-agent']
777 def http_response(self, req, resp):
780 if resp.headers.get('Content-encoding', '') == 'gzip':
781 content = resp.read()
782 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
784 uncompressed = io.BytesIO(gz.read())
785 except IOError as original_ioerror:
786 # There may be junk add the end of the file
787 # See http://stackoverflow.com/q/4928560/35070 for details
788 for i in range(1, 1024):
790 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
791 uncompressed = io.BytesIO(gz.read())
796 raise original_ioerror
797 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
798 resp.msg = old_resp.msg
800 if resp.headers.get('Content-encoding', '') == 'deflate':
801 gz = io.BytesIO(self.deflate(resp.read()))
802 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
803 resp.msg = old_resp.msg
806 https_request = http_request
807 https_response = http_response
810 def parse_iso8601(date_str, delimiter='T'):
811 """ Return a UNIX timestamp from the given date """
817 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
820 timezone = datetime.timedelta()
822 date_str = date_str[:-len(m.group(0))]
823 if not m.group('sign'):
824 timezone = datetime.timedelta()
826 sign = 1 if m.group('sign') == '+' else -1
827 timezone = datetime.timedelta(
828 hours=sign * int(m.group('hours')),
829 minutes=sign * int(m.group('minutes')))
830 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
831 dt = datetime.datetime.strptime(date_str, date_format) - timezone
832 return calendar.timegm(dt.timetuple())
835 def unified_strdate(date_str):
836 """Return a string with the date in the format YYYYMMDD"""
843 date_str = date_str.replace(',', ' ')
844 # %z (UTC offset) is only supported in python>=3.2
845 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
846 format_expressions = [
851 '%b %dst %Y %I:%M%p',
852 '%b %dnd %Y %I:%M%p',
853 '%b %dth %Y %I:%M%p',
863 '%Y-%m-%dT%H:%M:%SZ',
864 '%Y-%m-%dT%H:%M:%S.%fZ',
865 '%Y-%m-%dT%H:%M:%S.%f0Z',
867 '%Y-%m-%dT%H:%M:%S.%f',
870 for expression in format_expressions:
872 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
875 if upload_date is None:
876 timetuple = email.utils.parsedate_tz(date_str)
878 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
881 def determine_ext(url, default_ext=u'unknown_video'):
884 guess = url.partition(u'?')[0].rpartition(u'.')[2]
885 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: strip the media extension from
    *filename* and append ".<language>.<format>"."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
893 def date_from_str(date_str):
895 Return a datetime object from a string in the format YYYYMMDD or
896 (now|today)[+-][0-9](day|week|month|year)(s)?"""
897 today = datetime.date.today()
898 if date_str == 'now'or date_str == 'today':
900 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
901 if match is not None:
902 sign = match.group('sign')
903 time = int(match.group('time'))
906 unit = match.group('unit')
915 delta = datetime.timedelta(**{unit: time})
917 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
919 def hyphenate_date(date_str):
921 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
922 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
923 if match is not None:
924 return '-'.join(match.groups())
928 class DateRange(object):
929 """Represents a time interval between two dates"""
930 def __init__(self, start=None, end=None):
931 """start and end must be strings in the format accepted by date"""
932 if start is not None:
933 self.start = date_from_str(start)
935 self.start = datetime.datetime.min.date()
937 self.end = date_from_str(end)
939 self.end = datetime.datetime.max.date()
940 if self.start > self.end:
941 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
944 """Returns a range that only contains the given day"""
946 def __contains__(self, date):
947 """Check if the date is in the range"""
948 if not isinstance(date, datetime.date):
949 date = date_from_str(date)
950 return self.start <= date <= self.end
952 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
956 """ Returns the platform name as a compat_str """
957 res = platform.platform()
958 if isinstance(res, bytes):
959 res = res.decode(preferredencoding())
961 assert isinstance(res, compat_str)
965 def _windows_write_string(s, out):
966 """ Returns True if the string was written using special methods,
967 False if it has yet to be written out."""
968 # Adapted from http://stackoverflow.com/a/3259271/35070
971 import ctypes.wintypes
979 fileno = out.fileno()
980 except AttributeError:
981 # If the output stream doesn't have a fileno, it's virtual
983 if fileno not in WIN_OUTPUT_IDS:
986 GetStdHandle = ctypes.WINFUNCTYPE(
987 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
988 ("GetStdHandle", ctypes.windll.kernel32))
989 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
991 WriteConsoleW = ctypes.WINFUNCTYPE(
992 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
993 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
994 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
995 written = ctypes.wintypes.DWORD(0)
997 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
998 FILE_TYPE_CHAR = 0x0002
999 FILE_TYPE_REMOTE = 0x8000
1000 GetConsoleMode = ctypes.WINFUNCTYPE(
1001 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1002 ctypes.POINTER(ctypes.wintypes.DWORD))(
1003 ("GetConsoleMode", ctypes.windll.kernel32))
1004 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1006 def not_a_console(handle):
1007 if handle == INVALID_HANDLE_VALUE or handle is None:
1009 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1010 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1012 if not_a_console(h):
1015 def next_nonbmp_pos(s):
1017 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1018 except StopIteration:
1022 count = min(next_nonbmp_pos(s), 1024)
1024 ret = WriteConsoleW(
1025 h, s, count if count else 2, ctypes.byref(written), None)
1027 raise OSError('Failed to write string')
1028 if not count: # We just wrote a non-BMP character
1029 assert written.value == 2
1032 assert written.value > 0
1033 s = s[written.value:]
1037 def write_string(s, out=None, encoding=None):
1040 assert type(s) == compat_str
1042 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1043 if _windows_write_string(s, out):
1046 if ('b' in getattr(out, 'mode', '') or
1047 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1048 byt = s.encode(encoding or preferredencoding(), 'ignore')
1050 elif hasattr(out, 'buffer'):
1051 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1052 byt = s.encode(enc, 'ignore')
1053 out.buffer.write(byt)
1059 def bytes_to_intlist(bs):
1062 if isinstance(bs[0], int): # Python 3
1065 return [ord(c) for c in bs]
1068 def intlist_to_bytes(xs):
1071 if isinstance(chr(0), bytes): # Python 2
1072 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the youtube-dl cache directory.

    The 'cachedir' entry of *params* (an options dict) takes precedence;
    otherwise fall back to $XDG_CACHE_HOME/youtube-dl, defaulting
    XDG_CACHE_HOME to ~/.cache per the XDG base directory specification.
    """
    # Use a None sentinel instead of a mutable default argument; this also
    # makes an explicit params=None behave like "no options" instead of
    # raising AttributeError.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1083 # Cross-platform file locking
1084 if sys.platform == 'win32':
1085 import ctypes.wintypes
1088 class OVERLAPPED(ctypes.Structure):
1090 ('Internal', ctypes.wintypes.LPVOID),
1091 ('InternalHigh', ctypes.wintypes.LPVOID),
1092 ('Offset', ctypes.wintypes.DWORD),
1093 ('OffsetHigh', ctypes.wintypes.DWORD),
1094 ('hEvent', ctypes.wintypes.HANDLE),
1097 kernel32 = ctypes.windll.kernel32
1098 LockFileEx = kernel32.LockFileEx
1099 LockFileEx.argtypes = [
1100 ctypes.wintypes.HANDLE, # hFile
1101 ctypes.wintypes.DWORD, # dwFlags
1102 ctypes.wintypes.DWORD, # dwReserved
1103 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1104 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1105 ctypes.POINTER(OVERLAPPED) # Overlapped
1107 LockFileEx.restype = ctypes.wintypes.BOOL
1108 UnlockFileEx = kernel32.UnlockFileEx
1109 UnlockFileEx.argtypes = [
1110 ctypes.wintypes.HANDLE, # hFile
1111 ctypes.wintypes.DWORD, # dwReserved
1112 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1113 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1114 ctypes.POINTER(OVERLAPPED) # Overlapped
1116 UnlockFileEx.restype = ctypes.wintypes.BOOL
1117 whole_low = 0xffffffff
1118 whole_high = 0x7fffffff
1120 def _lock_file(f, exclusive):
1121 overlapped = OVERLAPPED()
1122 overlapped.Offset = 0
1123 overlapped.OffsetHigh = 0
1124 overlapped.hEvent = 0
1125 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1126 handle = msvcrt.get_osfhandle(f.fileno())
1127 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1128 whole_low, whole_high, f._lock_file_overlapped_p):
1129 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1131 def _unlock_file(f):
1132 assert f._lock_file_overlapped_p
1133 handle = msvcrt.get_osfhandle(f.fileno())
1134 if not UnlockFileEx(handle, 0,
1135 whole_low, whole_high, f._lock_file_overlapped_p):
1136 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1141 def _lock_file(f, exclusive):
1142 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1144 def _unlock_file(f):
1145 fcntl.lockf(f, fcntl.LOCK_UN)
1148 class locked_file(object):
1149 def __init__(self, filename, mode, encoding=None):
1150 assert mode in ['r', 'a', 'w']
1151 self.f = io.open(filename, mode, encoding=encoding)
1154 def __enter__(self):
1155 exclusive = self.mode != 'r'
1157 _lock_file(self.f, exclusive)
1163 def __exit__(self, etype, value, traceback):
1165 _unlock_file(self.f)
1172 def write(self, *args):
1173 return self.f.write(*args)
1175 def read(self, *args):
1176 return self.f.read(*args)
1179 def shell_quote(args):
1181 encoding = sys.getfilesystemencoding()
1182 if encoding is None:
1185 if isinstance(a, bytes):
1186 # We may get a filename encoded with 'encodeFilename'
1187 a = a.decode(encoding)
1188 quoted_args.append(pipes.quote(a))
1189 return u' '.join(quoted_args)
1192 def takewhile_inclusive(pred, seq):
1193 """ Like itertools.takewhile, but include the latest evaluated element
1194 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data is JSON-encoded and stored in the URL fragment, where it
    # never reaches the network but can be recovered by unsmuggle_url().
    smuggled = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'#'.join([url, smuggled])
1209 def unsmuggle_url(smug_url, default=None):
1210 if not '#__youtubedl_smuggle' in smug_url:
1211 return smug_url, default
1212 url, _, sdata = smug_url.rpartition(u'#')
1213 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1214 data = json.loads(jsond)
1218 def format_bytes(bytes):
1221 if type(bytes) is str:
1222 bytes = float(bytes)
1226 exponent = int(math.log(bytes, 1024.0))
1227 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1228 converted = float(bytes) / float(1024 ** exponent)
1229 return u'%.2f%s' % (converted, suffix)
1232 def get_term_width():
1233 columns = os.environ.get('COLUMNS', None)
1238 sp = subprocess.Popen(
1240 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1241 out, err = sp.communicate()
1242 return int(out.split()[1])
1248 def month_by_name(name):
1249 """ Return the number of a month by (locale-independently) English name """
1252 u'January', u'February', u'March', u'April', u'May', u'June',
1253 u'July', u'August', u'September', u'October', u'November', u'December']
1255 return ENGLISH_NAMES.index(name) + 1
1260 def fix_xml_ampersands(xml_str):
1261 """Replace all the '&' by '&' in XML"""
1263 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1268 def setproctitle(title):
1269 assert isinstance(title, compat_str)
1271 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1274 title_bytes = title.encode('utf-8')
1275 buf = ctypes.create_string_buffer(len(title_bytes))
1276 buf.value = title_bytes
1278 libc.prctl(15, buf, 0, 0, 0)
1279 except AttributeError:
1280 return # Strange libc, just skip this
1283 def remove_start(s, start):
1284 if s.startswith(start):
1285 return s[len(start):]
1289 def remove_end(s, end):
1291 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of *url* (the empty string when
    the path is empty or just '/')."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
1300 class HEADRequest(compat_urllib_request.Request):
1301 def get_method(self):
1305 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1308 v = getattr(v, get_attr, None)
1311 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* to compat_str, passing None through as *default*."""
    if v is None:
        return default
    return compat_str(v)
1318 def str_to_int(int_str):
1321 int_str = re.sub(r'[,\.]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; None maps to *default*."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration like '123', '45s', '9:12' or '1:02:03' into seconds.

    Returns None for None input or unrecognised formats.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        # Hours can only be present when minutes are
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    return res
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: ('a.mp4', 'part') -> 'a.part.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # args defaults to None (not []) to avoid a shared mutable default.
    try:
        subprocess.Popen(
            [exe] + list(args or []),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable
        return False
    return exe
class PagedList(object):
    """A lazily-paged sequence of entries.

    pagefunc(pagenum) must return the list of entries for that 0-based page;
    pagesize is the maximum number of entries per page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end), fetching only the
        pages that intersect that range."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Skip pages entirely before the requested range
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One-past-the-last wanted entry within this page, or None
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    r"""Decode literal \UXXXXXXXX escape sequences embedded in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        # getdecoder returns (decoded, length); we want the decoded text
        lambda m: unicode_escape(m.group(0))[0],
        s)
# Probe whether struct accepts a text format string; define wrappers if not.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text format strings work natively; use struct directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, one per line.

    Strips a UTF-8 BOM and surrounding whitespace, skips blank lines and
    comment lines (starting with '#', ';' or ']'), and closes *batch_fd*.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM wrongly decoded as latin-1 appears as these three chars
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def parse_xml(s):
    """Parse an XML document from text *s*, ignoring any DOCTYPE declaration."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's ElementTree.XML does not accept a parser argument
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# On Python 2 / Windows, getpass chokes on unicode prompts; encode them first.
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper, turning `callback({...});` into `{...}`."""
    pattern = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(pattern, r'\1', code)
def js_to_json(code):
    """Convert simple JavaScript object literals to valid JSON.

    Rewrites single-quoted or bare keys/values to double-quoted ones and
    removes trailing commas before ']'.
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            # Bare identifier key -> quote it
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # Drop trailing commas in arrays
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            # Position in the list is the quality rank (higher = better)
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id ranks below everything
            return -1
    return q
# Output filename template used when the user does not supply one:
# "<title>-<video id>.<extension>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1525 subprocess_check_output = subprocess.check_output
1526 except AttributeError:
1527 def subprocess_check_output(*args, **kwargs):
1528 assert 'input' not in kwargs
1529 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1530 output, _ = p.communicate()
1533 raise subprocess.CalledProcessError(ret, p.args, output=output)