X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=2a5a85dc67b4f7a57d04d4f21c1608aa2c47f7f3;hb=69a0c470b5cbcb789ef0358b7f13a18bf7564fc1;hp=77a13aea533d17aa57f17e01929ca3a276787844;hpb=f8b362739e4f469b501aa804beb95cf1cfb1c916;p=youtube-dl

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 77a13aea5..2a5a85dc6 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -35,6 +35,8 @@ class InfoExtractor(object):
     title:          Video title, unescaped.
     ext:            Video filename extension.
 
+    Instead of url and ext, formats can also specified.
+
     The following fields are optional:
 
     format:         The video format, defaults to ext (used for --get-format)
@@ -52,8 +54,20 @@ class InfoExtractor(object):
     view_count:     How many users have watched the video on the platform.
     urlhandle:      [internal] The urlHandle to be used to download the file,
                     like returned by urllib.request.urlopen
-
-    The fields should all be Unicode strings.
+    age_limit:      Age restriction for the video, as an integer (years)
+    formats:        A list of dictionaries for each format available, it must
+                    be ordered from worst to best quality. Potential fields:
+                    * url       Mandatory. The URL of the video file
+                    * ext       Will be calculated from url if missing
+                    * format    A human-readable description of the format
+                                ("mp4 container with h264/opus").
+                                Calculated from width and height if missing.
+                    * format_id A short description of the format
+                                ("mp4_h264_opus" or "19")
+                    * width     Width of the video, if known
+                    * height    Height of the video, if known
+
+    Unless mentioned otherwise, the fields should be Unicode strings.
 
     Subclasses of this one should re-define the _real_initialize() and
     _real_extract() methods and define a _VALID_URL regexp.
@@ -145,12 +159,17 @@ class InfoExtractor(object):
 
         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
         content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
         else:
-            encoding = 'utf-8'
-        webpage_bytes = urlh.read()
+            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
+                          webpage_bytes[:1024])
+            if m:
+                encoding = m.group(1).decode('ascii')
+            else:
+                encoding = 'utf-8'
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
@@ -300,6 +319,15 @@ class InfoExtractor(object):
                                         self._og_regex('video')],
                                        html, name, **kargs)
 
+    def _rta_search(self, html):
+        # See http://www.rtalabel.org/index.php?content=howtofaq#single
+        if re.search(r'(?ix)<meta\s+name="rating"\s+'
+                     r'     content="RTA-5042-1996-1400-1577-RTA"',
+                     html):
+            return 18
+        return 0
+
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.