[extractor/common] Clarify preference key in formats

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 56c54a5ce2627ecc9d488fbe12901c689fefeb3b..78f238f8428c5df0fce2dcc26f66b1301595e62b 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -66,14 +66,15 @@ class InfoExtractor(object):
                      * asr        Audio sampling rate in Hertz
                      * vbr        Average video bitrate in KBit/s
                      * vcodec     Name of the video codec in use
                      * asr        Audio sampling rate in Hertz
                      * vbr        Average video bitrate in KBit/s
                      * vcodec     Name of the video codec in use
+                    * container  Name of the container format
                      * filesize   The number of bytes, if known in advance
                      * player_url SWF Player URL (used for rtmpdump).
                      * protocol   The protocol that will be used for the actual
                                   download, lower-case.
                      * filesize   The number of bytes, if known in advance
                      * player_url SWF Player URL (used for rtmpdump).
                      * protocol   The protocol that will be used for the actual
                                   download, lower-case.
-                                 "http", "https", "rtsp", "rtmp" or so.
+                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                      * preference Order number of this format. If this field is
                                   present and not None, the formats get sorted
                      * preference Order number of this format. If this field is
                                   present and not None, the formats get sorted
-                                 by this field.
+                                 by this field, regardless of all other values.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                      * quality    Order number of the video quality of this
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                      * quality    Order number of the video quality of this
@@ -87,12 +88,18 @@ class InfoExtractor(object):
  
      The following fields are optional:
  
  
      The following fields are optional:
  
+    display_id:     An alternative identifier for the video, not necessarily
+                    unique, but available before title. Typically, id is
+                    something like "4234987", title "Dancing naked mole rats",
+                    and display_id "dancing-naked-mole-rats".
      thumbnails:     A list of dictionaries (with the entries "resolution" and
                      "url") for the varying thumbnails
      thumbnail:      Full URL to a video thumbnail image.
      description:    One-line video description.
      uploader:       Full name of the video uploader.
      thumbnails:     A list of dictionaries (with the entries "resolution" and
                      "url") for the varying thumbnails
      thumbnail:      Full URL to a video thumbnail image.
      description:    One-line video description.
      uploader:       Full name of the video uploader.
+    timestamp:      UNIX timestamp of the moment the video became available.
      upload_date:    Video upload date (YYYYMMDD).
      upload_date:    Video upload date (YYYYMMDD).
+                    If not explicitly set, calculated from timestamp.
      uploader_id:    Nickname or id of the video uploader.
      location:       Physical location of the video.
      subtitles:      The subtitle file contents as a dictionary in the format
      uploader_id:    Nickname or id of the video uploader.
      location:       Physical location of the video.
      subtitles:      The subtitle file contents as a dictionary in the format
@@ -113,9 +120,6 @@ class InfoExtractor(object):
      _real_extract() methods and define a _VALID_URL regexp.
      Probably, they should also be added to the list of extractors.
  
      _real_extract() methods and define a _VALID_URL regexp.
      Probably, they should also be added to the list of extractors.
  
-    _real_extract() must return a *list* of information dictionaries as
-    described above.
-
      Finally, the _WORKING attribute should be set to False for broken IEs
      in order to warn the users and skip the tests.
      """
      Finally, the _WORKING attribute should be set to False for broken IEs
      in order to warn the users and skip the tests.
      """
@@ -221,6 +225,8 @@ class InfoExtractor(object):
                            webpage_bytes[:1024])
              if m:
                  encoding = m.group(1).decode('ascii')
                            webpage_bytes[:1024])
              if m:
                  encoding = m.group(1).decode('ascii')
+            elif webpage_bytes.startswith(b'\xff\xfe'):
+                encoding = 'utf-16'
              else:
                  encoding = 'utf-8'
          if self._downloader.params.get('dump_intermediate_pages', False):
              else:
                  encoding = 'utf-8'
          if self._downloader.params.get('dump_intermediate_pages', False):
@@ -237,7 +243,7 @@ class InfoExtractor(object):
              except AttributeError:
                  url = url_or_request
              if len(url) > 200:
              except AttributeError:
                  url = url_or_request
              if len(url) > 200:
-                h = hashlib.md5(url).hexdigest()
+                h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
                  url = url[:200 - len(h)] + h
              raw_filename = ('%s_%s.dump' % (video_id, url))
              filename = sanitize_filename(raw_filename, restricted=True)
                  url = url[:200 - len(h)] + h
              raw_filename = ('%s_%s.dump' % (video_id, url))
              filename = sanitize_filename(raw_filename, restricted=True)
@@ -268,8 +274,11 @@ class InfoExtractor(object):
  
      def _download_json(self, url_or_request, video_id,
                         note=u'Downloading JSON metadata',
  
      def _download_json(self, url_or_request, video_id,
                         note=u'Downloading JSON metadata',
-                       errnote=u'Unable to download JSON metadata'):
+                       errnote=u'Unable to download JSON metadata',
+                       transform_source=None):
          json_string = self._download_webpage(url_or_request, video_id, note, errnote)
          json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        if transform_source:
+            json_string = transform_source(json_string)
          try:
              return json.loads(json_string)
          except ValueError as ve:
          try:
              return json.loads(json_string)
          except ValueError as ve:
@@ -396,7 +405,7 @@ class InfoExtractor(object):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
          property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
          property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
@@ -426,14 +435,14 @@ class InfoExtractor(object):
          if secure: regexes = self._og_regexes('video:secure_url') + regexes
          return self._html_search_regex(regexes, html, name, **kargs)
  
          if secure: regexes = self._og_regexes('video:secure_url') + regexes
          return self._html_search_regex(regexes, html, name, **kargs)
  
-    def _html_search_meta(self, name, html, display_name=None):
+    def _html_search_meta(self, name, html, display_name=None, fatal=False):
          if display_name is None:
              display_name = name
          return self._html_search_regex(
              r'''(?ix)<meta
                      (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                      [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
          if display_name is None:
              display_name = name
          return self._html_search_regex(
              r'''(?ix)<meta
                      (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                      [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
-            html, display_name, fatal=False)
+            html, display_name, fatal=fatal)
  
      def _dc_search_uploader(self, html):
          return self._html_search_meta('dc.creator', html, 'uploader')
  
      def _dc_search_uploader(self, html):
          return self._html_search_meta('dc.creator', html, 'uploader')
@@ -462,7 +471,14 @@ class InfoExtractor(object):
          }
          return RATING_TABLE.get(rating.lower(), None)
  
          }
          return RATING_TABLE.get(rating.lower(), None)
  
+    def _twitter_search_player(self, html):
+        return self._html_search_meta('twitter:player', html,
+            'twitter card player')
+
      def _sort_formats(self, formats):
      def _sort_formats(self, formats):
+        if not formats:
+            raise ExtractorError(u'No video formats found')
+
          def _formats_key(f):
              # TODO remove the following workaround
              from ..utils import determine_ext
          def _formats_key(f):
              # TODO remove the following workaround
              from ..utils import determine_ext