Merge branch 'patch-1' of https://github.com/tuexss/youtube-dl into tuexss-patch-1

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 0f49d602eab1296dced15449a949c26dc4408d66..52f0dd09aac2ef0103212086a280fc317b36b82d 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -75,7 +75,7 @@ def preferredencoding():
      try:
          pref = locale.getpreferredencoding()
          'TEST'.encode(pref)
-    except:
+    except Exception:
          pref = 'UTF-8'
  
      return pref
@@ -127,7 +127,7 @@ def write_json_file(obj, fn):
              except OSError:
                  pass
          os.rename(tf.name, fn)
-    except:
+    except Exception:
          try:
              os.remove(tf.name)
          except OSError:
@@ -252,15 +252,12 @@ def sanitize_open(filename, open_mode):
              raise
  
          # In case of error, try to remove win32 forbidden chars
-        alt_filename = os.path.join(
-            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
-            for path_part in os.path.split(filename)
-        )
+        alt_filename = sanitize_path(filename)
          if alt_filename == filename:
              raise
          else:
              # An exception here should be caught in the caller
-            stream = open(encodeFilename(filename), open_mode)
+            stream = open(encodeFilename(alt_filename), open_mode)
              return (stream, alt_filename)
  
  
@@ -322,13 +319,20 @@ def sanitize_path(s):
      if unc_or_drive:
          norm_path.pop(0)
      sanitized_path = [
-        re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
+        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
          for path_part in norm_path]
      if unc_or_drive:
          sanitized_path.insert(0, unc_or_drive + os.path.sep)
      return os.path.join(*sanitized_path)
  
  
+def sanitize_url_path_consecutive_slashes(url):
+    """Collapse every run of two or more slashes in the URL's path into a single slash"""
+    parsed_url = list(compat_urlparse.urlparse(url))  # copied into a list so one item can be reassigned
+    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])  # item 2 is the path, assuming stdlib urlparse field order
+    return compat_urlparse.urlunparse(parsed_url)
+
+
  def orderedSet(iterable):
      """ Remove all duplicates from the input iterable """
      res = []
@@ -344,7 +348,7 @@ def _htmlentity_transform(entity):
      if entity in compat_html_entities.name2codepoint:
          return compat_chr(compat_html_entities.name2codepoint[entity])
  
-    mobj = re.match(r'#(x?[0-9]+)', entity)
+    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
      if mobj is not None:
          numstr = mobj.group(1)
          if numstr.startswith('x'):
@@ -726,7 +730,8 @@ def unified_strdate(date_str, day_first=True):
      # Replace commas
      date_str = date_str.replace(',', ' ')
      # %z (UTC offset) is only supported in python>=3.2
-    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
+    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
+        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
      # Remove AM/PM + timezone
      date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
  
@@ -755,6 +760,7 @@ def unified_strdate(date_str, day_first=True):
      ]
      if day_first:
          format_expressions.extend([
+            '%d-%m-%Y',
              '%d.%m.%Y',
              '%d/%m/%Y',
              '%d/%m/%y',
@@ -762,6 +768,7 @@ def unified_strdate(date_str, day_first=True):
          ])
      else:
          format_expressions.extend([
+            '%m-%d-%Y',
              '%m.%d.%Y',
              '%m/%d/%Y',
              '%m/%d/%y',
@@ -1573,7 +1580,7 @@ def js_to_json(code):
          '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
          [a-zA-Z_][.a-zA-Z_0-9]*
          ''', fix_kv, code)
-    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
+    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
      return res