X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=e82e3998a7c30d8ae14c9561b51946df0cbebcff;hb=dc03a42537cba83597ca8acb2bbe03f686f2136c;hp=03566d22357a086bb1019bb1187db9b5425b83f0;hpb=347de4931cb3e496fb7b1dfb0314c213f44cce6b;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 03566d223..e82e3998a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,7 +35,6 @@ import zlib from .compat import ( compat_basestring, compat_chr, - compat_getenv, compat_html_entities, compat_http_client, compat_parse_qs, @@ -54,7 +53,7 @@ from .compat import ( compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -62,6 +61,11 @@ std_headers = { } +ENGLISH_MONTH_NAMES = [ + 'January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', 'December'] + + def preferredencoding(): """Get preferred encoding. @@ -248,15 +252,12 @@ def sanitize_open(filename, open_mode): raise # In case of error, try to remove win32 forbidden chars - alt_filename = os.path.join( - re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) - for path_part in os.path.split(filename) - ) + alt_filename = sanitize_path(filename) if alt_filename == filename: raise else: # An exception here should be caught in the caller - stream = open(encodeFilename(filename), open_mode) + stream = open(encodeFilename(alt_filename), open_mode) return (stream, alt_filename) @@ -299,11 +300,32 @@ def sanitize_filename(s, restricted=False, is_id=False): # Common case of "Foreign band name - English song title" if restricted and result.startswith('-_'): result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] + result = result.lstrip('.') if not result: result = '_' return result +def sanitize_path(s): + """Sanitizes and normalizes path on Windows""" + if sys.platform != 'win32': + return s + drive, _ = os.path.splitdrive(s) + unc, _ = os.path.splitunc(s) + unc_or_drive = unc or drive + norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep) + if unc_or_drive: + norm_path.pop(0) + sanitized_path = [ + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + for path_part in norm_path] + if unc_or_drive: + sanitized_path.insert(0, unc_or_drive + os.path.sep) + return os.path.join(*sanitized_path) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -666,26 +688,27 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) -def parse_iso8601(date_str, delimiter='T'): +def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ if date_str is None: return None - m = re.search( - r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group(0))] - if not m.group('sign'): + if timezone is None: + m = re.search( + r'(\.[0-9]+)?(?:Z$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', + date_str) + if not m: timezone = datetime.timedelta() else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) + date_str = date_str[:-len(m.group(0))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone return calendar.timegm(dt.timetuple()) @@ -894,8 +917,8 @@ def _windows_write_string(s, out): def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR - or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or + GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) if not_a_console(h): return False @@ -1165,30 +1188,21 @@ def parse_filesize(s): return int(float(num_str) * mult) -def get_term_width(): - columns = compat_getenv('COLUMNS', None) - if columns: - return int(columns) +def month_by_name(name): + """ Return the number of a month by (locale-independently) English name """ try: - sp = subprocess.Popen( - ['stty', 'size'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() - return int(out.split()[1]) - except: - pass - return None + return ENGLISH_MONTH_NAMES.index(name) + 1 + except ValueError: + return None -def month_by_name(name): - """ Return the number of a month by (locale-independently) English name """ +def month_by_abbreviation(abbrev): + """ Return the number of a month by (locale-independently) English + abbreviations """ - ENGLISH_NAMES = [ - 'January', 'February', 'March', 'April', 'May', 'June', - 'July', 'August', 'September', 'October', 'November', 'December'] try: - return ENGLISH_NAMES.index(name) + 1 + return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1 except ValueError: return None @@ -1275,6 +1289,7 @@ def parse_duration(s): (?P[0-9.]+)\s*(?:mins?|minutes?)\s*| (?P[0-9.]+)\s*(?:hours?)| + \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*| (?: (?: (?:(?P[0-9]+)\s*(?:[:d]|days?)\s*)? @@ -1293,10 +1308,14 @@ def parse_duration(s): return float_or_none(m.group('only_hours'), invscale=60 * 60) if m.group('secs'): res += int(m.group('secs')) + if m.group('mins_reversed'): + res += int(m.group('mins_reversed')) * 60 if m.group('mins'): res += int(m.group('mins')) * 60 if m.group('hours'): res += int(m.group('hours')) * 60 * 60 + if m.group('hours_reversed'): + res += int(m.group('hours_reversed')) * 60 * 60 if m.group('days'): res += int(m.group('days')) * 24 * 60 * 60 if m.group('ms'): @@ -1547,8 +1566,8 @@ def js_to_json(code): return '"%s"' % v res = re.sub(r'''(?x) - "(?:[^"\\]*(?:\\\\|\\")?)*"| - '(?:[^'\\]*(?:\\\\|\\')?)*'| + "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"| + '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'| [a-zA-Z_][.a-zA-Z_0-9]* ''', fix_kv, code) res = re.sub(r',(\s*\])', lambda m: m.group(1), res) @@ -1603,6 +1622,15 @@ def args_to_str(args): return ' '.join(shlex_quote(a) for a in args) +def mimetype2ext(mt): + _, _, res = mt.rpartition('/') + + return { + 'x-ms-wmv': 'wmv', + 'x-mp4-fragmented': 'mp4', + }.get(res, res) + + def urlhandle_detect_ext(url_handle): try: url_handle.headers @@ -1618,7 +1646,7 @@ def urlhandle_detect_ext(url_handle): if e: return e - return getheader('Content-Type').split("/")[1] + return mimetype2ext(getheader('Content-Type')) def age_restricted(content_limit, age_limit): @@ -1755,3 +1783,39 @@ def match_filter_func(filter_str): video_title = info_dict.get('title', info_dict.get('id', 'video')) return '%s does not pass filter %s, skipping ..' % (video_title, filter_str) return _match_func + + +class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): + def __init__(self, proxies=None): + # Set default handlers + for type in ('http', 'https'): + setattr(self, '%s_open' % type, + lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: + meth(r, proxy, type)) + return compat_urllib_request.ProxyHandler.__init__(self, proxies) + + def proxy_open(self, req, proxy, type): + req_proxy = req.headers.get('Ytdl-request-proxy') + if req_proxy is not None: + proxy = req_proxy + del req.headers['Ytdl-request-proxy'] + + if proxy == '__noproxy__': + return None # No Proxy + return compat_urllib_request.ProxyHandler.proxy_open( + self, req, proxy, type) + + +def url_sanitize_consecutive_slashes(url): + """Sanitize URLs with consecutive slashes + + For example, transform both + http://hostname/foo//bar/filename.html + and + http://hostname//foo/bar/filename.html + into + http://hostname/foo/bar/filename.html + """ + parsed_url = list(compat_urlparse.urlparse(url)) + parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2]) + return compat_urlparse.urlunparse(parsed_url)