_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import errno
   5 import gzip
   6 import io
   7 import json
   8 import locale
   9 import os
  10 import re
  11 import sys
  12 import traceback
  13 import zlib
  14 import email.utils
  15 import json
  16 import datetime
  17
  18 try:
  19     import urllib.request as compat_urllib_request
  20 except ImportError: # Python 2
  21     import urllib2 as compat_urllib_request
  22
  23 try:
  24     import urllib.error as compat_urllib_error
  25 except ImportError: # Python 2
  26     import urllib2 as compat_urllib_error
  27
  28 try:
  29     import urllib.parse as compat_urllib_parse
  30 except ImportError: # Python 2
  31     import urllib as compat_urllib_parse
  32
  33 try:
  34     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  35 except ImportError: # Python 2
  36     from urlparse import urlparse as compat_urllib_parse_urlparse
  37
  38 try:
  39     import http.cookiejar as compat_cookiejar
  40 except ImportError: # Python 2
  41     import cookielib as compat_cookiejar
  42
  43 try:
  44     import html.entities as compat_html_entities
  45 except ImportError: # Python 2
  46     import htmlentitydefs as compat_html_entities
  47
  48 try:
  49     import html.parser as compat_html_parser
  50 except ImportError: # Python 2
  51     import HTMLParser as compat_html_parser
  52
  53 try:
  54     import http.client as compat_http_client
  55 except ImportError: # Python 2
  56     import httplib as compat_http_client
  57
  58 try:
  59     from subprocess import DEVNULL
  60     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  61 except ImportError:
  62     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  63
  64 try:
  65     from urllib.parse import parse_qs as compat_parse_qs
  66 except ImportError: # Python 2
  67     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  68     # Python 2's version is apparently totally broken
  69     def _unquote(string, encoding='utf-8', errors='replace'):
  70         if string == '':
  71             return string
  72         res = string.split('%')
  73         if len(res) == 1:
  74             return string
  75         if encoding is None:
  76             encoding = 'utf-8'
  77         if errors is None:
  78             errors = 'replace'
  79         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  80         pct_sequence = b''
  81         string = res[0]
  82         for item in res[1:]:
  83             try:
  84                 if not item:
  85                     raise ValueError
  86                 pct_sequence += item[:2].decode('hex')
  87                 rest = item[2:]
  88                 if not rest:
  89                     # This segment was just a single percent-encoded character.
  90                     # May be part of a sequence of code units, so delay decoding.
  91                     # (Stored in pct_sequence).
  92                     continue
  93             except ValueError:
  94                 rest = '%' + item
  95             # Encountered non-percent-encoded characters. Flush the current
  96             # pct_sequence.
  97             string += pct_sequence.decode(encoding, errors) + rest
  98             pct_sequence = b''
  99         if pct_sequence:
 100             # Flush the final pct_sequence
 101             string += pct_sequence.decode(encoding, errors)
 102         return string
 103
 104     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 105                 encoding='utf-8', errors='replace'):
 106         qs, _coerce_result = qs, unicode
 107         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 108         r = []
 109         for name_value in pairs:
 110             if not name_value and not strict_parsing:
 111                 continue
 112             nv = name_value.split('=', 1)
 113             if len(nv) != 2:
 114                 if strict_parsing:
 115                     raise ValueError("bad query field: %r" % (name_value,))
 116                 # Handle case of a control-name with no equal sign
 117                 if keep_blank_values:
 118                     nv.append('')
 119                 else:
 120                     continue
 121             if len(nv[1]) or keep_blank_values:
 122                 name = nv[0].replace('+', ' ')
 123                 name = _unquote(name, encoding=encoding, errors=errors)
 124                 name = _coerce_result(name)
 125                 value = nv[1].replace('+', ' ')
 126                 value = _unquote(value, encoding=encoding, errors=errors)
 127                 value = _coerce_result(value)
 128                 r.append((name, value))
 129         return r
 130
 131     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 132                 encoding='utf-8', errors='replace'):
 133         parsed_result = {}
 134         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 135                         encoding=encoding, errors=errors)
 136         for name, value in pairs:
 137             if name in parsed_result:
 138                 parsed_result[name].append(value)
 139             else:
 140                 parsed_result[name] = [value]
 141         return parsed_result
 142
 143 try:
 144     compat_str = unicode # Python 2
 145 except NameError:
 146     compat_str = str
 147
 148 try:
 149     compat_chr = unichr # Python 2
 150 except NameError:
 151     compat_chr = chr
 152
 153 def compat_ord(c):
 154     if type(c) is int: return c
 155     else: return ord(c)
 156
 157 std_headers = {
 158     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
 159     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 160     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 161     'Accept-Encoding': 'gzip, deflate',
 162     'Accept-Language': 'en-us,en;q=0.5',
 163 }
 164
 165 def preferredencoding():
 166     """Get preferred encoding.
 167
 168     Returns the best encoding scheme for the system, based on
 169     locale.getpreferredencoding() and some further tweaks.
 170     """
 171     try:
 172         pref = locale.getpreferredencoding()
 173         u'TEST'.encode(pref)
 174     except:
 175         pref = 'UTF-8'
 176
 177     return pref
 178
 179 if sys.version_info < (3,0):
 180     def compat_print(s):
 181         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 182 else:
 183     def compat_print(s):
 184         assert type(s) == type(u'')
 185         print(s)
 186
 187 # In Python 2.x, json.dump expects a bytestream.
 188 # In Python 3.x, it writes to a character stream
 189 if sys.version_info < (3,0):
 190     def write_json_file(obj, fn):
 191         with open(fn, 'wb') as f:
 192             json.dump(obj, f)
 193 else:
 194     def write_json_file(obj, fn):
 195         with open(fn, 'w', encoding='utf-8') as f:
 196             json.dump(obj, f)
 197
 198 def htmlentity_transform(matchobj):
 199     """Transforms an HTML entity to a character.
 200
 201     This function receives a match object and is intended to be used with
 202     the re.sub() function.
 203     """
 204     entity = matchobj.group(1)
 205
 206     # Known non-numeric HTML entity
 207     if entity in compat_html_entities.name2codepoint:
 208         return compat_chr(compat_html_entities.name2codepoint[entity])
 209
 210     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 211     if mobj is not None:
 212         numstr = mobj.group(1)
 213         if numstr.startswith(u'x'):
 214             base = 16
 215             numstr = u'0%s' % numstr
 216         else:
 217             base = 10
 218         return compat_chr(int(numstr, base))
 219
 220     # Unknown entity in name, return its literal representation
 221     return (u'&%s;' % entity)
 222
 223 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 224 class AttrParser(compat_html_parser.HTMLParser):
 225     """Modified HTMLParser that isolates a tag with the specified attribute"""
 226     def __init__(self, attribute, value):
 227         self.attribute = attribute
 228         self.value = value
 229         self.result = None
 230         self.started = False
 231         self.depth = {}
 232         self.html = None
 233         self.watch_startpos = False
 234         self.error_count = 0
 235         compat_html_parser.HTMLParser.__init__(self)
 236
 237     def error(self, message):
 238         if self.error_count > 10 or self.started:
 239             raise compat_html_parser.HTMLParseError(message, self.getpos())
 240         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 241         self.error_count += 1
 242         self.goahead(1)
 243
 244     def loads(self, html):
 245         self.html = html
 246         self.feed(html)
 247         self.close()
 248
 249     def handle_starttag(self, tag, attrs):
 250         attrs = dict(attrs)
 251         if self.started:
 252             self.find_startpos(None)
 253         if self.attribute in attrs and attrs[self.attribute] == self.value:
 254             self.result = [tag]
 255             self.started = True
 256             self.watch_startpos = True
 257         if self.started:
 258             if not tag in self.depth: self.depth[tag] = 0
 259             self.depth[tag] += 1
 260
 261     def handle_endtag(self, tag):
 262         if self.started:
 263             if tag in self.depth: self.depth[tag] -= 1
 264             if self.depth[self.result[0]] == 0:
 265                 self.started = False
 266                 self.result.append(self.getpos())
 267
 268     def find_startpos(self, x):
 269         """Needed to put the start position of the result (self.result[1])
 270         after the opening tag with the requested id"""
 271         if self.watch_startpos:
 272             self.watch_startpos = False
 273             self.result.append(self.getpos())
 274     handle_entityref = handle_charref = handle_data = handle_comment = \
 275     handle_decl = handle_pi = unknown_decl = find_startpos
 276
 277     def get_result(self):
 278         if self.result is None:
 279             return None
 280         if len(self.result) != 3:
 281             return None
 282         lines = self.html.split('\n')
 283         lines = lines[self.result[1][0]-1:self.result[2][0]]
 284         lines[0] = lines[0][self.result[1][1]:]
 285         if len(lines) == 1:
 286             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 287         lines[-1] = lines[-1][:self.result[2][1]]
 288         return '\n'.join(lines).strip()
 289 # Hack for https://github.com/rg3/youtube-dl/issues/662
 290 if sys.version_info < (2, 7, 3):
 291     AttrParser.parse_endtag = (lambda self, i:
 292         i + len("</scr'+'ipt>")
 293         if self.rawdata[i:].startswith("</scr'+'ipt>")
 294         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 295
 296 def get_element_by_id(id, html):
 297     """Return the content of the tag with the specified ID in the passed HTML document"""
 298     return get_element_by_attribute("id", id, html)
 299
 300 def get_element_by_attribute(attribute, value, html):
 301     """Return the content of the tag with the specified attribute in the passed HTML document"""
 302     parser = AttrParser(attribute, value)
 303     try:
 304         parser.loads(html)
 305     except compat_html_parser.HTMLParseError:
 306         pass
 307     return parser.get_result()
 308
 309
 310 def clean_html(html):
 311     """Clean an HTML snippet into a readable string"""
 312     # Newline vs <br />
 313     html = html.replace('\n', ' ')
 314     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 315     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 316     # Strip html tags
 317     html = re.sub('<.*?>', '', html)
 318     # Replace html entities
 319     html = unescapeHTML(html)
 320     return html.strip()
 321
 322
 323 def sanitize_open(filename, open_mode):
 324     """Try to open the given filename, and slightly tweak it if this fails.
 325
 326     Attempts to open the given filename. If this fails, it tries to change
 327     the filename slightly, step by step, until it's either able to open it
 328     or it fails and raises a final exception, like the standard open()
 329     function.
 330
 331     It returns the tuple (stream, definitive_file_name).
 332     """
 333     try:
 334         if filename == u'-':
 335             if sys.platform == 'win32':
 336                 import msvcrt
 337                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 338             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 339         stream = open(encodeFilename(filename), open_mode)
 340         return (stream, filename)
 341     except (IOError, OSError) as err:
 342         if err.errno in (errno.EACCES,):
 343             raise
 344
 345         # In case of error, try to remove win32 forbidden chars
 346         alt_filename = os.path.join(
 347                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 348                         for path_part in os.path.split(filename)
 349                        )
 350         if alt_filename == filename:
 351             raise
 352         else:
 353             # An exception here should be caught in the caller
 354             stream = open(encodeFilename(filename), open_mode)
 355             return (stream, alt_filename)
 356
 357
 358 def timeconvert(timestr):
 359     """Convert RFC 2822 defined time string into system timestamp"""
 360     timestamp = None
 361     timetuple = email.utils.parsedate_tz(timestr)
 362     if timetuple is not None:
 363         timestamp = email.utils.mktime_tz(timetuple)
 364     return timestamp
 365
 366 def sanitize_filename(s, restricted=False, is_id=False):
 367     """Sanitizes a string so it could be used as part of a filename.
 368     If restricted is set, use a stricter subset of allowed characters.
 369     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 370     """
 371     def replace_insane(char):
 372         if char == '?' or ord(char) < 32 or ord(char) == 127:
 373             return ''
 374         elif char == '"':
 375             return '' if restricted else '\''
 376         elif char == ':':
 377             return '_-' if restricted else ' -'
 378         elif char in '\\/|*<>':
 379             return '_'
 380         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 381             return '_'
 382         if restricted and ord(char) > 127:
 383             return '_'
 384         return char
 385
 386     result = u''.join(map(replace_insane, s))
 387     if not is_id:
 388         while '__' in result:
 389             result = result.replace('__', '_')
 390         result = result.strip('_')
 391         # Common case of "Foreign band name - English song title"
 392         if restricted and result.startswith('-_'):
 393             result = result[2:]
 394         if not result:
 395             result = '_'
 396     return result
 397
 398 def orderedSet(iterable):
 399     """ Remove all duplicates from the input iterable """
 400     res = []
 401     for el in iterable:
 402         if el not in res:
 403             res.append(el)
 404     return res
 405
 406 def unescapeHTML(s):
 407     """
 408     @param s a string
 409     """
 410     assert type(s) == type(u'')
 411
 412     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 413     return result
 414
 415 def encodeFilename(s):
 416     """
 417     @param s The name of the file
 418     """
 419
 420     assert type(s) == type(u'')
 421
 422     # Python 3 has a Unicode API
 423     if sys.version_info >= (3, 0):
 424         return s
 425
 426     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 427         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 428         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 429         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 430         return s
 431     else:
 432         encoding = sys.getfilesystemencoding()
 433         if encoding is None:
 434             encoding = 'utf-8'
 435         return s.encode(encoding, 'ignore')
 436
 437 def decodeOption(optval):
 438     if optval is None:
 439         return optval
 440     if isinstance(optval, bytes):
 441         optval = optval.decode(preferredencoding())
 442
 443     assert isinstance(optval, compat_str)
 444     return optval
 445
 446 def formatSeconds(secs):
 447     if secs > 3600:
 448         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 449     elif secs > 60:
 450         return '%d:%02d' % (secs // 60, secs % 60)
 451     else:
 452         return '%d' % secs
 453
 454 def make_HTTPS_handler(opts):
 455     if sys.version_info < (3,2):
 456         # Python's 2.x handler is very simplistic
 457         return compat_urllib_request.HTTPSHandler()
 458     else:
 459         import ssl
 460         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 461         context.set_default_verify_paths()
 462
 463         context.verify_mode = (ssl.CERT_NONE
 464                                if opts.no_check_certificate
 465                                else ssl.CERT_REQUIRED)
 466         return compat_urllib_request.HTTPSHandler(context=context)
 467
 468 class ExtractorError(Exception):
 469     """Error during info extraction."""
 470     def __init__(self, msg, tb=None):
 471         """ tb, if given, is the original traceback (so that it can be printed out). """
 472         super(ExtractorError, self).__init__(msg)
 473         self.traceback = tb
 474         self.exc_info = sys.exc_info()  # preserve original exception
 475
 476     def format_traceback(self):
 477         if self.traceback is None:
 478             return None
 479         return u''.join(traceback.format_tb(self.traceback))
 480
 481
 482 class DownloadError(Exception):
 483     """Download Error exception.
 484
 485     This exception may be thrown by FileDownloader objects if they are not
 486     configured to continue on errors. They will contain the appropriate
 487     error message.
 488     """
 489     def __init__(self, msg, exc_info=None):
 490         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 491         super(DownloadError, self).__init__(msg)
 492         self.exc_info = exc_info
 493
 494
 495 class SameFileError(Exception):
 496     """Same File exception.
 497
 498     This exception will be thrown by FileDownloader objects if they detect
 499     multiple files would have to be downloaded to the same file on disk.
 500     """
 501     pass
 502
 503
 504 class PostProcessingError(Exception):
 505     """Post Processing exception.
 506
 507     This exception may be raised by PostProcessor's .run() method to
 508     indicate an error in the postprocessing task.
 509     """
 510     def __init__(self, msg):
 511         self.msg = msg
 512
 513 class MaxDownloadsReached(Exception):
 514     """ --max-downloads limit has been reached. """
 515     pass
 516
 517
 518 class UnavailableVideoError(Exception):
 519     """Unavailable Format exception.
 520
 521     This exception will be thrown when a video is requested
 522     in a format that is not available for that video.
 523     """
 524     pass
 525
 526
 527 class ContentTooShortError(Exception):
 528     """Content Too Short exception.
 529
 530     This exception may be raised by FileDownloader objects when a file they
 531     download is too small for what the server announced first, indicating
 532     the connection was probably interrupted.
 533     """
 534     # Both in bytes
 535     downloaded = None
 536     expected = None
 537
 538     def __init__(self, downloaded, expected):
 539         self.downloaded = downloaded
 540         self.expected = expected
 541
 542 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 543     """Handler for HTTP requests and responses.
 544
 545     This class, when installed with an OpenerDirector, automatically adds
 546     the standard headers to every HTTP request and handles gzipped and
 547     deflated responses from web servers. If compression is to be avoided in
 548     a particular request, the original request in the program code only has
 549     to include the HTTP header "Youtubedl-No-Compression", which will be
 550     removed before making the real request.
 551
 552     Part of this code was copied from:
 553
 554     http://techknack.net/python-urllib2-handlers/
 555
 556     Andrew Rowls, the author of that code, agreed to release it to the
 557     public domain.
 558     """
 559
 560     @staticmethod
 561     def deflate(data):
 562         try:
 563             return zlib.decompress(data, -zlib.MAX_WBITS)
 564         except zlib.error:
 565             return zlib.decompress(data)
 566
 567     @staticmethod
 568     def addinfourl_wrapper(stream, headers, url, code):
 569         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 570             return compat_urllib_request.addinfourl(stream, headers, url, code)
 571         ret = compat_urllib_request.addinfourl(stream, headers, url)
 572         ret.code = code
 573         return ret
 574
 575     def http_request(self, req):
 576         for h,v in std_headers.items():
 577             if h in req.headers:
 578                 del req.headers[h]
 579             req.add_header(h, v)
 580         if 'Youtubedl-no-compression' in req.headers:
 581             if 'Accept-encoding' in req.headers:
 582                 del req.headers['Accept-encoding']
 583             del req.headers['Youtubedl-no-compression']
 584         if 'Youtubedl-user-agent' in req.headers:
 585             if 'User-agent' in req.headers:
 586                 del req.headers['User-agent']
 587             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 588             del req.headers['Youtubedl-user-agent']
 589         return req
 590
 591     def http_response(self, req, resp):
 592         old_resp = resp
 593         # gzip
 594         if resp.headers.get('Content-encoding', '') == 'gzip':
 595             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 596             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 597             resp.msg = old_resp.msg
 598         # deflate
 599         if resp.headers.get('Content-encoding', '') == 'deflate':
 600             gz = io.BytesIO(self.deflate(resp.read()))
 601             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 602             resp.msg = old_resp.msg
 603         return resp
 604
 605     https_request = http_request
 606     https_response = http_response
 607
 608 def unified_strdate(date_str):
 609     """Return a string with the date in the format YYYYMMDD"""
 610     upload_date = None
 611     #Replace commas
 612     date_str = date_str.replace(',',' ')
 613     # %z (UTC offset) is only supported in python>=3.2
 614     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 615     format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
 616     for expression in format_expressions:
 617         try:
 618             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 619         except:
 620             pass
 621     return upload_date
 622
 623 def date_from_str(date_str):
 624     """
 625     Return a datetime object from a string in the format YYYYMMDD or
 626     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 627     today = datetime.date.today()
 628     if date_str == 'now'or date_str == 'today':
 629         return today
 630     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 631     if match is not None:
 632         sign = match.group('sign')
 633         time = int(match.group('time'))
 634         if sign == '-':
 635             time = -time
 636         unit = match.group('unit')
 637         #A bad aproximation?
 638         if unit == 'month':
 639             unit = 'day'
 640             time *= 30
 641         elif unit == 'year':
 642             unit = 'day'
 643             time *= 365
 644         unit += 's'
 645         delta = datetime.timedelta(**{unit: time})
 646         return today + delta
 647     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 648
 649 class DateRange(object):
 650     """Represents a time interval between two dates"""
 651     def __init__(self, start=None, end=None):
 652         """start and end must be strings in the format accepted by date"""
 653         if start is not None:
 654             self.start = date_from_str(start)
 655         else:
 656             self.start = datetime.datetime.min.date()
 657         if end is not None:
 658             self.end = date_from_str(end)
 659         else:
 660             self.end = datetime.datetime.max.date()
 661         if self.start > self.end:
 662             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 663     @classmethod
 664     def day(cls, day):
 665         """Returns a range that only contains the given day"""
 666         return cls(day,day)
 667     def __contains__(self, date):
 668         """Check if the date is in the range"""
 669         if not isinstance(date, datetime.date):
 670             date = date_from_str(date)
 671         return self.start <= date <= self.end
 672     def __str__(self):
 673         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())