[extractor/common] Encode the url when calculating the md5 with `—write-pages` option
[youtube-dl] / youtube_dl / extractor / common.py
index 692d828da9ef9739e1b05908e6a9f39259b0940b..3cf742a3b6f49751ee4827a7d1f5cde3a59a884c 100644 (file)
@@ -63,8 +63,10 @@ class InfoExtractor(object):
                     * tbr        Average bitrate of audio and video in KBit/s
                     * abr        Average audio bitrate in KBit/s
                     * acodec     Name of the audio codec in use
+                    * asr        Audio sampling rate in Hertz
                     * vbr        Average video bitrate in KBit/s
                     * vcodec     Name of the video codec in use
+                    * container  Name of the container format
                     * filesize   The number of bytes, if known in advance
                     * player_url SWF Player URL (used for rtmpdump).
                     * protocol   The protocol that will be used for the actual
@@ -220,6 +222,8 @@ class InfoExtractor(object):
                           webpage_bytes[:1024])
             if m:
                 encoding = m.group(1).decode('ascii')
+            elif webpage_bytes.startswith(b'\xff\xfe'):
+                encoding = 'utf-16'
             else:
                 encoding = 'utf-8'
         if self._downloader.params.get('dump_intermediate_pages', False):
@@ -236,7 +240,7 @@ class InfoExtractor(object):
             except AttributeError:
                 url = url_or_request
             if len(url) > 200:
-                h = hashlib.md5(url).hexdigest()
+                h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
                 url = url[:200 - len(h)] + h
             raw_filename = ('%s_%s.dump' % (video_id, url))
             filename = sanitize_filename(raw_filename, restricted=True)