Add display_id field

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index ce3d1690304c8a0b488c2f674c48bb4ef34c86b5..080c9bdfada0440201a85027ddbb34772d09b28f 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,4 +1,5 @@
  import base64
+import hashlib
  import json
  import os
  import re
@@ -62,13 +63,15 @@ class InfoExtractor(object):
                      * tbr        Average bitrate of audio and video in KBit/s
                      * abr        Average audio bitrate in KBit/s
                      * acodec     Name of the audio codec in use
+                    * asr        Audio sampling rate in Hertz
                      * vbr        Average video bitrate in KBit/s
                      * vcodec     Name of the video codec in use
+                    * container  Name of the container format
                      * filesize   The number of bytes, if known in advance
                      * player_url SWF Player URL (used for rtmpdump).
                      * protocol   The protocol that will be used for the actual
                                   download, lower-case.
-                                 "http", "https", "rtsp", "rtmp" or so.
+                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                      * preference Order number of this format. If this field is
                                   present and not None, the formats get sorted
                                   by this field.
@@ -85,6 +88,10 @@ class InfoExtractor(object):
  
      The following fields are optional:
  
+    display_id      An alternative identifier for the video, not necessarily
+                    unique, but available before title. Typically, id is
+                    something like "4234987", title "Dancing naked mole rats",
+                    and display_id "dancing-naked-mole-rats"
      thumbnails:     A list of dictionaries (with the entries "resolution" and
                      "url") for the varying thumbnails
      thumbnail:      Full URL to a video thumbnail image.
@@ -219,6 +226,8 @@ class InfoExtractor(object):
                            webpage_bytes[:1024])
              if m:
                  encoding = m.group(1).decode('ascii')
+            elif webpage_bytes.startswith(b'\xff\xfe'):
+                encoding = 'utf-16'
              else:
                  encoding = 'utf-8'
          if self._downloader.params.get('dump_intermediate_pages', False):
@@ -234,6 +243,9 @@ class InfoExtractor(object):
                  url = url_or_request.get_full_url()
              except AttributeError:
                  url = url_or_request
+            if len(url) > 200:
+                h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
+                url = url[:200 - len(h)] + h
              raw_filename = ('%s_%s.dump' % (video_id, url))
              filename = sanitize_filename(raw_filename, restricted=True)
              self.to_screen(u'Saving request to ' + filename)
@@ -263,8 +275,11 @@ class InfoExtractor(object):
  
      def _download_json(self, url_or_request, video_id,
                         note=u'Downloading JSON metadata',
-                       errnote=u'Unable to download JSON metadata'):
+                       errnote=u'Unable to download JSON metadata',
+                       transform_source=None):
          json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        if transform_source:
+            json_string = transform_source(json_string)
          try:
              return json.loads(json_string)
          except ValueError as ve:
@@ -391,7 +406,7 @@ class InfoExtractor(object):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
          property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
@@ -457,7 +472,14 @@ class InfoExtractor(object):
          }
          return RATING_TABLE.get(rating.lower(), None)
  
+    def _twitter_search_player(self, html):
+        return self._html_search_meta('twitter:player', html,
+            'twitter card player')
+
      def _sort_formats(self, formats):
+        if not formats:
+            raise ExtractorError(u'No video formats found')
+
          def _formats_key(f):
              # TODO remove the following workaround
              from ..utils import determine_ext