_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import errno
   5 import gzip
   6 import io
   7 import json
   8 import locale
   9 import os
  10 import re
  11 import sys
  12 import traceback
  13 import zlib
  14 import email.utils
  15 import socket
  16 import datetime
  17
  18 try:
  19     import urllib.request as compat_urllib_request
  20 except ImportError: # Python 2
  21     import urllib2 as compat_urllib_request
  22
  23 try:
  24     import urllib.error as compat_urllib_error
  25 except ImportError: # Python 2
  26     import urllib2 as compat_urllib_error
  27
  28 try:
  29     import urllib.parse as compat_urllib_parse
  30 except ImportError: # Python 2
  31     import urllib as compat_urllib_parse
  32
  33 try:
  34     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  35 except ImportError: # Python 2
  36     from urlparse import urlparse as compat_urllib_parse_urlparse
  37
  38 try:
  39     import urllib.parse as compat_urlparse
  40 except ImportError: # Python 2
  41     import urlparse as compat_urlparse
  42
  43 try:
  44     import http.cookiejar as compat_cookiejar
  45 except ImportError: # Python 2
  46     import cookielib as compat_cookiejar
  47
  48 try:
  49     import html.entities as compat_html_entities
  50 except ImportError: # Python 2
  51     import htmlentitydefs as compat_html_entities
  52
  53 try:
  54     import html.parser as compat_html_parser
  55 except ImportError: # Python 2
  56     import HTMLParser as compat_html_parser
  57
  58 try:
  59     import http.client as compat_http_client
  60 except ImportError: # Python 2
  61     import httplib as compat_http_client
  62
  63 try:
  64     from urllib.error import HTTPError as compat_HTTPError
  65 except ImportError:  # Python 2
  66     from urllib2 import HTTPError as compat_HTTPError
  67
  68 try:
  69     from subprocess import DEVNULL
  70     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  71 except ImportError:
  72     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  73
  74 try:
  75     from urllib.parse import parse_qs as compat_parse_qs
  76 except ImportError: # Python 2
  77     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  78     # Python 2's version is apparently totally broken
  79     def _unquote(string, encoding='utf-8', errors='replace'):
  80         if string == '':
  81             return string
  82         res = string.split('%')
  83         if len(res) == 1:
  84             return string
  85         if encoding is None:
  86             encoding = 'utf-8'
  87         if errors is None:
  88             errors = 'replace'
  89         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  90         pct_sequence = b''
  91         string = res[0]
  92         for item in res[1:]:
  93             try:
  94                 if not item:
  95                     raise ValueError
  96                 pct_sequence += item[:2].decode('hex')
  97                 rest = item[2:]
  98                 if not rest:
  99                     # This segment was just a single percent-encoded character.
 100                     # May be part of a sequence of code units, so delay decoding.
 101                     # (Stored in pct_sequence).
 102                     continue
 103             except ValueError:
 104                 rest = '%' + item
 105             # Encountered non-percent-encoded characters. Flush the current
 106             # pct_sequence.
 107             string += pct_sequence.decode(encoding, errors) + rest
 108             pct_sequence = b''
 109         if pct_sequence:
 110             # Flush the final pct_sequence
 111             string += pct_sequence.decode(encoding, errors)
 112         return string
 113
 114     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 115                 encoding='utf-8', errors='replace'):
 116         qs, _coerce_result = qs, unicode
 117         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 118         r = []
 119         for name_value in pairs:
 120             if not name_value and not strict_parsing:
 121                 continue
 122             nv = name_value.split('=', 1)
 123             if len(nv) != 2:
 124                 if strict_parsing:
 125                     raise ValueError("bad query field: %r" % (name_value,))
 126                 # Handle case of a control-name with no equal sign
 127                 if keep_blank_values:
 128                     nv.append('')
 129                 else:
 130                     continue
 131             if len(nv[1]) or keep_blank_values:
 132                 name = nv[0].replace('+', ' ')
 133                 name = _unquote(name, encoding=encoding, errors=errors)
 134                 name = _coerce_result(name)
 135                 value = nv[1].replace('+', ' ')
 136                 value = _unquote(value, encoding=encoding, errors=errors)
 137                 value = _coerce_result(value)
 138                 r.append((name, value))
 139         return r
 140
 141     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 142                 encoding='utf-8', errors='replace'):
 143         parsed_result = {}
 144         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 145                         encoding=encoding, errors=errors)
 146         for name, value in pairs:
 147             if name in parsed_result:
 148                 parsed_result[name].append(value)
 149             else:
 150                 parsed_result[name] = [value]
 151         return parsed_result
 152
# Text string type: `unicode` where that builtin exists (Python 2),
# otherwise `str`.  Referencing the missing name raises NameError, which
# selects the fallback.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> text character function: `unichr` where that builtin
# exists (Python 2), otherwise `chr`.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 162
 163 def compat_ord(c):
 164     if type(c) is int: return c
 165     else: return ord(c)
 166
# This is not clearly defined otherwise
# (the class of compiled pattern objects, obtained by compiling an empty regex)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.  YoutubeDLHandler.http_request adds every
# entry of this dict to each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 177
 178 def preferredencoding():
 179     """Get preferred encoding.
 180
 181     Returns the best encoding scheme for the system, based on
 182     locale.getpreferredencoding() and some further tweaks.
 183     """
 184     try:
 185         pref = locale.getpreferredencoding()
 186         u'TEST'.encode(pref)
 187     except:
 188         pref = 'UTF-8'
 189
 190     return pref
 191
 192 if sys.version_info < (3,0):
 193     def compat_print(s):
 194         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 195 else:
 196     def compat_print(s):
 197         assert type(s) == type(u'')
 198         print(s)
 199
 200 # In Python 2.x, json.dump expects a bytestream.
 201 # In Python 3.x, it writes to a character stream
 202 if sys.version_info < (3,0):
 203     def write_json_file(obj, fn):
 204         with open(fn, 'wb') as f:
 205             json.dump(obj, f)
 206 else:
 207     def write_json_file(obj, fn):
 208         with open(fn, 'w', encoding='utf-8') as f:
 209             json.dump(obj, f)
 210
 211 if sys.version_info >= (2,7):
 212     def find_xpath_attr(node, xpath, key, val):
 213         """ Find the xpath xpath[@key=val] """
 214         assert re.match(r'^[a-zA-Z]+$', key)
 215         assert re.match(r'^[a-zA-Z@\s]*$', val)
 216         expr = xpath + u"[@%s='%s']" % (key, val)
 217         return node.find(expr)
 218 else:
 219     def find_xpath_attr(node, xpath, key, val):
 220         for f in node.findall(xpath):
 221             if f.attrib.get(key) == val:
 222                 return f
 223         return None
 224
 225 def htmlentity_transform(matchobj):
 226     """Transforms an HTML entity to a character.
 227
 228     This function receives a match object and is intended to be used with
 229     the re.sub() function.
 230     """
 231     entity = matchobj.group(1)
 232
 233     # Known non-numeric HTML entity
 234     if entity in compat_html_entities.name2codepoint:
 235         return compat_chr(compat_html_entities.name2codepoint[entity])
 236
 237     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 238     if mobj is not None:
 239         numstr = mobj.group(1)
 240         if numstr.startswith(u'x'):
 241             base = 16
 242             numstr = u'0%s' % numstr
 243         else:
 244             base = 10
 245         return compat_chr(int(numstr, base))
 246
 247     # Unknown entity in name, return its literal representation
 248     return (u'&%s;' % entity)
 249
# Replace the module-level start-tag locator regex of the HTML parser module
# with the pattern below (a backported bugfix, per the trailing note).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    Usage: construct with the attribute name and value to look for, feed the
    document through loads(), then call get_result() to obtain the text
    between the matching opening tag and its closing tag.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # None until a matching tag is seen; then grows to
        # [tag name, start position, end position]
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Per-tag-name count of currently open tags (tracked only while started)
        self.depth = {}
        # Full document text, kept for position-based slicing in get_result()
        self.html = None
        # True right after the matching start tag, until the next parser event
        # records where the element content begins
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from parse errors that occur before the element was found.

        After more than 10 errors, or once inside the matched element, the
        error is raised as HTMLParseError.  Otherwise the remaining input is
        replaced with the document text following the current line and
        parsing resumes.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document html."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is the first event after the matched tag:
            # it marks where the content starts
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once every open tag with the matched
            # tag's name has been closed again
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also counts as "content has started"
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if no complete element was captured."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            # Start or end position missing
            return None
        # Positions are used as (line, column) pairs with 1-based lines;
        # presumably this is the getpos() convention (see HTMLParser docs)
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column has to be
            # shifted by the prefix that was just cut off
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, override parse_endtag so that the
# literal text </scr'+'ipt> is stepped over (the returned index points just
# past it) instead of being handed to the stock end-tag parser; every other
# input is delegated to HTMLParser.parse_endtag unchanged.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 322
 323 def get_element_by_id(id, html):
 324     """Return the content of the tag with the specified ID in the passed HTML document"""
 325     return get_element_by_attribute("id", id, html)
 326
 327 def get_element_by_attribute(attribute, value, html):
 328     """Return the content of the tag with the specified attribute in the passed HTML document"""
 329     parser = AttrParser(attribute, value)
 330     try:
 331         parser.loads(html)
 332     except compat_html_parser.HTMLParseError:
 333         pass
 334     return parser.get_result()
 335
 336
 337 def clean_html(html):
 338     """Clean an HTML snippet into a readable string"""
 339     # Newline vs <br />
 340     html = html.replace('\n', ' ')
 341     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 342     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 343     # Strip html tags
 344     html = re.sub('<.*?>', '', html)
 345     # Replace html entities
 346     html = unescapeHTML(html)
 347     return html.strip()
 348
 349
 350 def sanitize_open(filename, open_mode):
 351     """Try to open the given filename, and slightly tweak it if this fails.
 352
 353     Attempts to open the given filename. If this fails, it tries to change
 354     the filename slightly, step by step, until it's either able to open it
 355     or it fails and raises a final exception, like the standard open()
 356     function.
 357
 358     It returns the tuple (stream, definitive_file_name).
 359     """
 360     try:
 361         if filename == u'-':
 362             if sys.platform == 'win32':
 363                 import msvcrt
 364                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 365             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 366         stream = open(encodeFilename(filename), open_mode)
 367         return (stream, filename)
 368     except (IOError, OSError) as err:
 369         if err.errno in (errno.EACCES,):
 370             raise
 371
 372         # In case of error, try to remove win32 forbidden chars
 373         alt_filename = os.path.join(
 374                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 375                         for path_part in os.path.split(filename)
 376                        )
 377         if alt_filename == filename:
 378             raise
 379         else:
 380             # An exception here should be caught in the caller
 381             stream = open(encodeFilename(filename), open_mode)
 382             return (stream, alt_filename)
 383
 384
 385 def timeconvert(timestr):
 386     """Convert RFC 2822 defined time string into system timestamp"""
 387     timestamp = None
 388     timetuple = email.utils.parsedate_tz(timestr)
 389     if timetuple is not None:
 390         timestamp = email.utils.mktime_tz(timetuple)
 391     return timestamp
 392
 393 def sanitize_filename(s, restricted=False, is_id=False):
 394     """Sanitizes a string so it could be used as part of a filename.
 395     If restricted is set, use a stricter subset of allowed characters.
 396     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 397     """
 398     def replace_insane(char):
 399         if char == '?' or ord(char) < 32 or ord(char) == 127:
 400             return ''
 401         elif char == '"':
 402             return '' if restricted else '\''
 403         elif char == ':':
 404             return '_-' if restricted else ' -'
 405         elif char in '\\/|*<>':
 406             return '_'
 407         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 408             return '_'
 409         if restricted and ord(char) > 127:
 410             return '_'
 411         return char
 412
 413     result = u''.join(map(replace_insane, s))
 414     if not is_id:
 415         while '__' in result:
 416             result = result.replace('__', '_')
 417         result = result.strip('_')
 418         # Common case of "Foreign band name - English song title"
 419         if restricted and result.startswith('-_'):
 420             result = result[2:]
 421         if not result:
 422             result = '_'
 423     return result
 424
 425 def orderedSet(iterable):
 426     """ Remove all duplicates from the input iterable """
 427     res = []
 428     for el in iterable:
 429         if el not in res:
 430             res.append(el)
 431     return res
 432
 433 def unescapeHTML(s):
 434     """
 435     @param s a string
 436     """
 437     assert type(s) == type(u'')
 438
 439     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 440     return result
 441
 442 def encodeFilename(s):
 443     """
 444     @param s The name of the file
 445     """
 446
 447     assert type(s) == type(u'')
 448
 449     # Python 3 has a Unicode API
 450     if sys.version_info >= (3, 0):
 451         return s
 452
 453     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 454         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 455         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 456         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 457         return s
 458     else:
 459         encoding = sys.getfilesystemencoding()
 460         if encoding is None:
 461             encoding = 'utf-8'
 462         return s.encode(encoding, 'ignore')
 463
 464 def decodeOption(optval):
 465     if optval is None:
 466         return optval
 467     if isinstance(optval, bytes):
 468         optval = optval.decode(preferredencoding())
 469
 470     assert isinstance(optval, compat_str)
 471     return optval
 472
 473 def formatSeconds(secs):
 474     if secs > 3600:
 475         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 476     elif secs > 60:
 477         return '%d:%02d' % (secs // 60, secs % 60)
 478     else:
 479         return '%d' % secs
 480
 481 def make_HTTPS_handler(opts):
 482     if sys.version_info < (3,2):
 483         # Python's 2.x handler is very simplistic
 484         return compat_urllib_request.HTTPSHandler()
 485     else:
 486         import ssl
 487         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 488         context.set_default_verify_paths()
 489
 490         context.verify_mode = (ssl.CERT_NONE
 491                                if opts.no_check_certificate
 492                                else ssl.CERT_REQUIRED)
 493         return compat_urllib_request.HTTPSHandler(context=context)
 494
 495 class ExtractorError(Exception):
 496     """Error during info extraction."""
 497     def __init__(self, msg, tb=None, expected=False, cause=None):
 498         """ tb, if given, is the original traceback (so that it can be printed out).
 499         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 500         """
 501
 502         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 503             expected = True
 504         if not expected:
 505             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 506         super(ExtractorError, self).__init__(msg)
 507
 508         self.traceback = tb
 509         self.exc_info = sys.exc_info()  # preserve original exception
 510         self.cause = cause
 511
 512     def format_traceback(self):
 513         if self.traceback is None:
 514             return None
 515         return u''.join(traceback.format_tb(self.traceback))
 516
 517
 518 class DownloadError(Exception):
 519     """Download Error exception.
 520
 521     This exception may be thrown by FileDownloader objects if they are not
 522     configured to continue on errors. They will contain the appropriate
 523     error message.
 524     """
 525     def __init__(self, msg, exc_info=None):
 526         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 527         super(DownloadError, self).__init__(msg)
 528         self.exc_info = exc_info
 529
 530
 531 class SameFileError(Exception):
 532     """Same File exception.
 533
 534     This exception will be thrown by FileDownloader objects if they detect
 535     multiple files would have to be downloaded to the same file on disk.
 536     """
 537     pass
 538
 539
 540 class PostProcessingError(Exception):
 541     """Post Processing exception.
 542
 543     This exception may be raised by PostProcessor's .run() method to
 544     indicate an error in the postprocessing task.
 545     """
 546     def __init__(self, msg):
 547         self.msg = msg
 548
 549 class MaxDownloadsReached(Exception):
 550     """ --max-downloads limit has been reached. """
 551     pass
 552
 553
 554 class UnavailableVideoError(Exception):
 555     """Unavailable Format exception.
 556
 557     This exception will be thrown when a video is requested
 558     in a format that is not available for that video.
 559     """
 560     pass
 561
 562
 563 class ContentTooShortError(Exception):
 564     """Content Too Short exception.
 565
 566     This exception may be raised by FileDownloader objects when a file they
 567     download is too small for what the server announced first, indicating
 568     the connection was probably interrupted.
 569     """
 570     # Both in bytes
 571     downloaded = None
 572     expected = None
 573
 574     def __init__(self, downloaded, expected):
 575         self.downloaded = downloaded
 576         self.expected = expected
 577
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    A "Youtubedl-User-Agent" request header is handled similarly: its value
    replaces the User-Agent header and the marker header itself is removed.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded data.

        A raw stream is tried first (negative wbits; see the zlib
        documentation); if that fails the data is decompressed with the
        default zlib settings instead.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response object carrying the status code.

        If the addinfourl class has a getcode attribute, the code is passed
        to the constructor; otherwise it is set as an attribute afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and process the Youtubedl-*
        marker headers.  Returns the modified request."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Marker header: the caller wants an uncompressed response
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Marker header: the caller supplies its own user agent
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip- or deflate-encoded body transparently
        decompressed; other responses are returned unchanged."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; the first
                # attempt that decompresses cleanly wins
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            # Wrap the decompressed body, keeping the headers, URL, status
            # code and message of the original response
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
 658
 659 def unified_strdate(date_str):
 660     """Return a string with the date in the format YYYYMMDD"""
 661     upload_date = None
 662     #Replace commas
 663     date_str = date_str.replace(',',' ')
 664     # %z (UTC offset) is only supported in python>=3.2
 665     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 666     format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
 667     for expression in format_expressions:
 668         try:
 669             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 670         except:
 671             pass
 672     return upload_date
 673
 674 def determine_ext(url, default_ext=u'unknown_video'):
 675     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 676     if re.match(r'^[A-Za-z0-9]+$', guess):
 677         return guess
 678     else:
 679         return default_ext
 680
 681 def subtitles_filename(filename, sub_lang, sub_format):
 682     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 683
 684 def date_from_str(date_str):
 685     """
 686     Return a datetime object from a string in the format YYYYMMDD or
 687     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 688     today = datetime.date.today()
 689     if date_str == 'now'or date_str == 'today':
 690         return today
 691     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 692     if match is not None:
 693         sign = match.group('sign')
 694         time = int(match.group('time'))
 695         if sign == '-':
 696             time = -time
 697         unit = match.group('unit')
 698         #A bad aproximation?
 699         if unit == 'month':
 700             unit = 'day'
 701             time *= 30
 702         elif unit == 'year':
 703             unit = 'day'
 704             time *= 365
 705         unit += 's'
 706         delta = datetime.timedelta(**{unit: time})
 707         return today + delta
 708     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 709
 710 class DateRange(object):
 711     """Represents a time interval between two dates"""
 712     def __init__(self, start=None, end=None):
 713         """start and end must be strings in the format accepted by date"""
 714         if start is not None:
 715             self.start = date_from_str(start)
 716         else:
 717             self.start = datetime.datetime.min.date()
 718         if end is not None:
 719             self.end = date_from_str(end)
 720         else:
 721             self.end = datetime.datetime.max.date()
 722         if self.start > self.end:
 723             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 724     @classmethod
 725     def day(cls, day):
 726         """Returns a range that only contains the given day"""
 727         return cls(day,day)
 728     def __contains__(self, date):
 729         """Check if the date is in the range"""
 730         if not isinstance(date, datetime.date):
 731             date = date_from_str(date)
 732         return self.start <= date <= self.end
 733     def __str__(self):
 734         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())