_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import errno
   5 import gzip
   6 import io
   7 import json
   8 import locale
   9 import os
  10 import re
  11 import sys
  12 import traceback
  13 import zlib
  14 import email.utils
  15 import json
  16 import datetime
  17
  18 try:
  19     import urllib.request as compat_urllib_request
  20 except ImportError: # Python 2
  21     import urllib2 as compat_urllib_request
  22
  23 try:
  24     import urllib.error as compat_urllib_error
  25 except ImportError: # Python 2
  26     import urllib2 as compat_urllib_error
  27
  28 try:
  29     import urllib.parse as compat_urllib_parse
  30 except ImportError: # Python 2
  31     import urllib as compat_urllib_parse
  32
  33 try:
  34     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  35 except ImportError: # Python 2
  36     from urlparse import urlparse as compat_urllib_parse_urlparse
  37
  38 try:
  39     import http.cookiejar as compat_cookiejar
  40 except ImportError: # Python 2
  41     import cookielib as compat_cookiejar
  42
  43 try:
  44     import html.entities as compat_html_entities
  45 except ImportError: # Python 2
  46     import htmlentitydefs as compat_html_entities
  47
  48 try:
  49     import html.parser as compat_html_parser
  50 except ImportError: # Python 2
  51     import HTMLParser as compat_html_parser
  52
  53 try:
  54     import http.client as compat_http_client
  55 except ImportError: # Python 2
  56     import httplib as compat_http_client
  57
  58 try:
  59     from subprocess import DEVNULL
  60     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  61 except ImportError:
  62     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  63
# Prefer the standard-library parse_qs; the except branch below supplies a
# replacement built from three local functions when that import fails.
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %XX escapes in string with the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding and error handler (both fall back
        to 'utf-8' / 'replace' when passed as None). A '%' that is not
        followed by two hex digits is kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex digits into one byte.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: put the '%' back and treat it as plain text.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Fields with an empty value are
        dropped unless keep_blank_values is true. With strict_parsing, a
        field without '=' raises ValueError; otherwise it is skipped (or
        kept with an empty value when keep_blank_values is true).
        """
        # Results are coerced with the Python 2 ``unicode`` type.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' stands for a space; it is translated before %XX decoding.
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values of a repeated name are appended in order of appearance.
        Arguments are passed through to _parse_qsl.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 142
# compat_str: the text type -- ``unicode`` where that builtin exists,
# otherwise ``str``.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: code point -> one-character text string -- ``unichr`` where
# that builtin exists, otherwise ``chr``.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 152
# Default HTTP request headers. YoutubeDLHandler.http_request() adds every
# entry of this dict to each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 160
 161 def preferredencoding():
 162     """Get preferred encoding.
 163
 164     Returns the best encoding scheme for the system, based on
 165     locale.getpreferredencoding() and some further tweaks.
 166     """
 167     try:
 168         pref = locale.getpreferredencoding()
 169         u'TEST'.encode(pref)
 170     except:
 171         pref = 'UTF-8'
 172
 173     return pref
 174
 175 if sys.version_info < (3,0):
 176     def compat_print(s):
 177         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 178 else:
 179     def compat_print(s):
 180         assert type(s) == type(u'')
 181         print(s)
 182
 183 # In Python 2.x, json.dump expects a bytestream.
 184 # In Python 3.x, it writes to a character stream
 185 if sys.version_info < (3,0):
 186     def write_json_file(obj, fn):
 187         with open(fn, 'wb') as f:
 188             json.dump(obj, f)
 189 else:
 190     def write_json_file(obj, fn):
 191         with open(fn, 'w', encoding='utf-8') as f:
 192             json.dump(obj, f)
 193
 194 def htmlentity_transform(matchobj):
 195     """Transforms an HTML entity to a character.
 196
 197     This function receives a match object and is intended to be used with
 198     the re.sub() function.
 199     """
 200     entity = matchobj.group(1)
 201
 202     # Known non-numeric HTML entity
 203     if entity in compat_html_entities.name2codepoint:
 204         return compat_chr(compat_html_entities.name2codepoint[entity])
 205
 206     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 207     if mobj is not None:
 208         numstr = mobj.group(1)
 209         if numstr.startswith(u'x'):
 210             base = 16
 211             numstr = u'0%s' % numstr
 212         else:
 213             base = 10
 214         return compat_chr(int(numstr, base))
 215
 216     # Unknown entity in name, return its literal representation
 217     return (u'&%s;' % entity)
 218
# Replace the module-level start-tag regex of the HTML parser module with a
# newer pattern (see the trailing comment); this affects the whole module,
# not only AttrParser.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    Usage: construct with the attribute name and value to look for, call
    loads(html), then get_result() to obtain the text inside the first
    matching element (or None).

    While parsing, self.result grows to [tag, start_pos, end_pos], where the
    positions are whatever getpos() reported just after the opening tag and
    at the closing tag that balances it.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None            # [tag, start_pos, end_pos] once found
        self.started = False          # True while inside the matching element
        self.depth = {}               # open-tag counter per tag name
        self.html = None              # full document, kept for error recovery and slicing
        self.watch_startpos = False   # True until the content start position is recorded
        self.error_count = 0          # number of parse errors recovered from so far
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error, or give up by raising HTMLParseError.

        Gives up after more than 10 recoveries, or as soon as the matching
        element has been entered. Otherwise the remaining input is reset to
        the document lines following the current one and parsing resumes.
        NOTE(review): presumably invoked by the base parser on malformed
        markup -- the call site is not in this file.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document html in one go."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        """Start tracking when the wanted attribute/value pair is seen; count nesting."""
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first thing after the opening tag:
            # record the content start position now.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Decrease nesting; when the matched tag is balanced, record the end position."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once the counter of its own tag name is back to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event after the opening tag also records the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None.

        None is returned when no matching element was found or when start
        and end positions were not both recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are used as (line, column) with a 1-based line number.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column must be taken
            # relative to the already-trimmed start.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        # NOTE(review): this also runs in the single-line case, on the line
        # that was already shortened above.
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
 285 # Hack for https://github.com/rg3/youtube-dl/issues/662
 286 if sys.version_info < (2, 7, 3):
 287     AttrParser.parse_endtag = (lambda self, i:
 288         i + len("</scr'+'ipt>")
 289         if self.rawdata[i:].startswith("</scr'+'ipt>")
 290         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 291
 292 def get_element_by_id(id, html):
 293     """Return the content of the tag with the specified ID in the passed HTML document"""
 294     return get_element_by_attribute("id", id, html)
 295
 296 def get_element_by_attribute(attribute, value, html):
 297     """Return the content of the tag with the specified attribute in the passed HTML document"""
 298     parser = AttrParser(attribute, value)
 299     try:
 300         parser.loads(html)
 301     except compat_html_parser.HTMLParseError:
 302         pass
 303     return parser.get_result()
 304
 305
 306 def clean_html(html):
 307     """Clean an HTML snippet into a readable string"""
 308     # Newline vs <br />
 309     html = html.replace('\n', ' ')
 310     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 311     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 312     # Strip html tags
 313     html = re.sub('<.*?>', '', html)
 314     # Replace html entities
 315     html = unescapeHTML(html)
 316     return html.strip()
 317
 318
 319 def sanitize_open(filename, open_mode):
 320     """Try to open the given filename, and slightly tweak it if this fails.
 321
 322     Attempts to open the given filename. If this fails, it tries to change
 323     the filename slightly, step by step, until it's either able to open it
 324     or it fails and raises a final exception, like the standard open()
 325     function.
 326
 327     It returns the tuple (stream, definitive_file_name).
 328     """
 329     try:
 330         if filename == u'-':
 331             if sys.platform == 'win32':
 332                 import msvcrt
 333                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 334             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 335         stream = open(encodeFilename(filename), open_mode)
 336         return (stream, filename)
 337     except (IOError, OSError) as err:
 338         if err.errno in (errno.EACCES,):
 339             raise
 340
 341         # In case of error, try to remove win32 forbidden chars
 342         alt_filename = os.path.join(
 343                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 344                         for path_part in os.path.split(filename)
 345                        )
 346         if alt_filename == filename:
 347             raise
 348         else:
 349             # An exception here should be caught in the caller
 350             stream = open(encodeFilename(filename), open_mode)
 351             return (stream, alt_filename)
 352
 353
 354 def timeconvert(timestr):
 355     """Convert RFC 2822 defined time string into system timestamp"""
 356     timestamp = None
 357     timetuple = email.utils.parsedate_tz(timestr)
 358     if timetuple is not None:
 359         timestamp = email.utils.mktime_tz(timetuple)
 360     return timestamp
 361
 362 def sanitize_filename(s, restricted=False, is_id=False):
 363     """Sanitizes a string so it could be used as part of a filename.
 364     If restricted is set, use a stricter subset of allowed characters.
 365     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 366     """
 367     def replace_insane(char):
 368         if char == '?' or ord(char) < 32 or ord(char) == 127:
 369             return ''
 370         elif char == '"':
 371             return '' if restricted else '\''
 372         elif char == ':':
 373             return '_-' if restricted else ' -'
 374         elif char in '\\/|*<>':
 375             return '_'
 376         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 377             return '_'
 378         if restricted and ord(char) > 127:
 379             return '_'
 380         return char
 381
 382     result = u''.join(map(replace_insane, s))
 383     if not is_id:
 384         while '__' in result:
 385             result = result.replace('__', '_')
 386         result = result.strip('_')
 387         # Common case of "Foreign band name - English song title"
 388         if restricted and result.startswith('-_'):
 389             result = result[2:]
 390         if not result:
 391             result = '_'
 392     return result
 393
 394 def orderedSet(iterable):
 395     """ Remove all duplicates from the input iterable """
 396     res = []
 397     for el in iterable:
 398         if el not in res:
 399             res.append(el)
 400     return res
 401
 402 def unescapeHTML(s):
 403     """
 404     @param s a string
 405     """
 406     assert type(s) == type(u'')
 407
 408     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 409     return result
 410
 411 def encodeFilename(s):
 412     """
 413     @param s The name of the file
 414     """
 415
 416     assert type(s) == type(u'')
 417
 418     # Python 3 has a Unicode API
 419     if sys.version_info >= (3, 0):
 420         return s
 421
 422     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 423         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 424         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 425         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 426         return s
 427     else:
 428         encoding = sys.getfilesystemencoding()
 429         if encoding is None:
 430             encoding = 'utf-8'
 431         return s.encode(encoding, 'ignore')
 432
 433 def decodeOption(optval):
 434     if optval is None:
 435         return optval
 436     if isinstance(optval, bytes):
 437         optval = optval.decode(preferredencoding())
 438
 439     assert isinstance(optval, compat_str)
 440     return optval
 441
 442 def formatSeconds(secs):
 443     if secs > 3600:
 444         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 445     elif secs > 60:
 446         return '%d:%02d' % (secs // 60, secs % 60)
 447     else:
 448         return '%d' % secs
 449
 450 def make_HTTPS_handler(opts):
 451     if sys.version_info < (3,2):
 452         # Python's 2.x handler is very simplistic
 453         return compat_urllib_request.HTTPSHandler()
 454     else:
 455         import ssl
 456         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 457         context.set_default_verify_paths()
 458
 459         context.verify_mode = (ssl.CERT_NONE
 460                                if opts.no_check_certificate
 461                                else ssl.CERT_REQUIRED)
 462         return compat_urllib_request.HTTPSHandler(context=context)
 463
 464 class ExtractorError(Exception):
 465     """Error during info extraction."""
 466     def __init__(self, msg, tb=None):
 467         """ tb, if given, is the original traceback (so that it can be printed out). """
 468         super(ExtractorError, self).__init__(msg)
 469         self.traceback = tb
 470         self.exc_info = sys.exc_info()  # preserve original exception
 471
 472     def format_traceback(self):
 473         if self.traceback is None:
 474             return None
 475         return u''.join(traceback.format_tb(self.traceback))
 476
 477
 478 class DownloadError(Exception):
 479     """Download Error exception.
 480
 481     This exception may be thrown by FileDownloader objects if they are not
 482     configured to continue on errors. They will contain the appropriate
 483     error message.
 484     """
 485     def __init__(self, msg, exc_info=None):
 486         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 487         super(DownloadError, self).__init__(msg)
 488         self.exc_info = exc_info
 489
 490
 491 class SameFileError(Exception):
 492     """Same File exception.
 493
 494     This exception will be thrown by FileDownloader objects if they detect
 495     multiple files would have to be downloaded to the same file on disk.
 496     """
 497     pass
 498
 499
 500 class PostProcessingError(Exception):
 501     """Post Processing exception.
 502
 503     This exception may be raised by PostProcessor's .run() method to
 504     indicate an error in the postprocessing task.
 505     """
 506     def __init__(self, msg):
 507         self.msg = msg
 508
 509 class MaxDownloadsReached(Exception):
 510     """ --max-downloads limit has been reached. """
 511     pass
 512
 513
 514 class UnavailableVideoError(Exception):
 515     """Unavailable Format exception.
 516
 517     This exception will be thrown when a video is requested
 518     in a format that is not available for that video.
 519     """
 520     pass
 521
 522
 523 class ContentTooShortError(Exception):
 524     """Content Too Short exception.
 525
 526     This exception may be raised by FileDownloader objects when a file they
 527     download is too small for what the server announced first, indicating
 528     the connection was probably interrupted.
 529     """
 530     # Both in bytes
 531     downloaded = None
 532     expected = None
 533
 534     def __init__(self, downloaded, expected):
 535         self.downloaded = downloaded
 536         self.expected = expected
 537
 538 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 539     """Handler for HTTP requests and responses.
 540
 541     This class, when installed with an OpenerDirector, automatically adds
 542     the standard headers to every HTTP request and handles gzipped and
 543     deflated responses from web servers. If compression is to be avoided in
 544     a particular request, the original request in the program code only has
 545     to include the HTTP header "Youtubedl-No-Compression", which will be
 546     removed before making the real request.
 547
 548     Part of this code was copied from:
 549
 550     http://techknack.net/python-urllib2-handlers/
 551
 552     Andrew Rowls, the author of that code, agreed to release it to the
 553     public domain.
 554     """
 555
 556     @staticmethod
 557     def deflate(data):
 558         try:
 559             return zlib.decompress(data, -zlib.MAX_WBITS)
 560         except zlib.error:
 561             return zlib.decompress(data)
 562
 563     @staticmethod
 564     def addinfourl_wrapper(stream, headers, url, code):
 565         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 566             return compat_urllib_request.addinfourl(stream, headers, url, code)
 567         ret = compat_urllib_request.addinfourl(stream, headers, url)
 568         ret.code = code
 569         return ret
 570
 571     def http_request(self, req):
 572         for h,v in std_headers.items():
 573             if h in req.headers:
 574                 del req.headers[h]
 575             req.add_header(h, v)
 576         if 'Youtubedl-no-compression' in req.headers:
 577             if 'Accept-encoding' in req.headers:
 578                 del req.headers['Accept-encoding']
 579             del req.headers['Youtubedl-no-compression']
 580         if 'Youtubedl-user-agent' in req.headers:
 581             if 'User-agent' in req.headers:
 582                 del req.headers['User-agent']
 583             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 584             del req.headers['Youtubedl-user-agent']
 585         return req
 586
 587     def http_response(self, req, resp):
 588         old_resp = resp
 589         # gzip
 590         if resp.headers.get('Content-encoding', '') == 'gzip':
 591             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 592             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 593             resp.msg = old_resp.msg
 594         # deflate
 595         if resp.headers.get('Content-encoding', '') == 'deflate':
 596             gz = io.BytesIO(self.deflate(resp.read()))
 597             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 598             resp.msg = old_resp.msg
 599         return resp
 600
 601     https_request = http_request
 602     https_response = http_response
 603
 604 def unified_strdate(date_str):
 605     """Return a string with the date in the format YYYYMMDD"""
 606     upload_date = None
 607     #Replace commas
 608     date_str = date_str.replace(',',' ')
 609     # %z (UTC offset) is only supported in python>=3.2
 610     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 611     format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
 612     for expression in format_expressions:
 613         try:
 614             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 615         except:
 616             pass
 617     return upload_date
 618
 619 def date_from_str(date_str):
 620     """
 621     Return a datetime object from a string in the format YYYYMMDD or
 622     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 623     today = datetime.date.today()
 624     if date_str == 'now'or date_str == 'today':
 625         return today
 626     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 627     if match is not None:
 628         sign = match.group('sign')
 629         time = int(match.group('time'))
 630         if sign == '-':
 631             time = -time
 632         unit = match.group('unit')
 633         #A bad aproximation?
 634         if unit == 'month':
 635             unit = 'day'
 636             time *= 30
 637         elif unit == 'year':
 638             unit = 'day'
 639             time *= 365
 640         unit += 's'
 641         delta = datetime.timedelta(**{unit: time})
 642         return today + delta
 643     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 644
 645 class DateRange(object):
 646     """Represents a time interval between two dates"""
 647     def __init__(self, start=None, end=None):
 648         """start and end must be strings in the format accepted by date"""
 649         if start is not None:
 650             self.start = date_from_str(start)
 651         else:
 652             self.start = datetime.datetime.min.date()
 653         if end is not None:
 654             self.end = date_from_str(end)
 655         else:
 656             self.end = datetime.datetime.max.date()
 657         if self.start > self.end:
 658             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 659     @classmethod
 660     def day(cls, day):
 661         """Returns a range that only contains the given day"""
 662         return cls(day,day)
 663     def __contains__(self, date):
 664         """Check if the date is in the range"""
 665         if not isinstance(date, datetime.date):
 666             date = date_from_str(date)
 667         return self.start <= date <= self.end
 668     def __str__(self):
 669         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())