[InfoExtractor/common] Correct and test meta tag matching

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index d302fe45fdea0bc7556fdbda4f321d64d86c2c7c..d703893dcfef1e772f1e294b0c08430ee6c15db3 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -21,6 +21,7 @@ from ..compat import (
      compat_str,
  )
  from ..utils import (
+    age_restricted,
      clean_html,
      compiled_regex_type,
      ExtractorError,
@@ -40,7 +41,7 @@ class InfoExtractor(object):
      information about the video (or videos) the URL refers to. This
      information includes the real video URL, the video title, author and
      others. The information is stored in a dictionary which is then
-    passed to the FileDownloader. The FileDownloader processes this
+    passed to the YoutubeDL. The YoutubeDL processes this
      information possibly downloading the video to the file system, among
      other possible outcomes.
  
@@ -92,6 +93,8 @@ class InfoExtractor(object):
                                   by this field, regardless of all other values.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
+                                 < -1000 to hide the format (if there is
+                                    another one which is strictly better)
                      * language_preference  Is this in the correct requested
                                   language?
                                   10 if it's what the URL is about,
@@ -589,9 +592,9 @@ class InfoExtractor(object):
          if display_name is None:
              display_name = name
          return self._html_search_regex(
-            r'''(?ix)<meta
+            r'''(?isx)<meta
                      (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
-                    [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
+                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
              html, display_name, fatal=fatal, group='content', **kwargs)
  
      def _dc_search_uploader(self, html):
@@ -875,6 +878,35 @@ class InfoExtractor(object):
              None, '/', True, False, expire_time, '', None, None, None)
          self._downloader.cookiejar.set_cookie(cookie)
  
+    def get_testcases(self, include_onlymatching=False):
+        """ Yield the test cases from _TEST or _TESTS, each tagged with a
+        'name'; only_matching ones are skipped unless include_onlymatching """
+        single = getattr(self, '_TEST', None)
+        if single:
+            assert not hasattr(self, '_TESTS'), \
+                '%s has _TEST and _TESTS' % type(self).__name__
+        ie_name = type(self).__name__[:-len('IE')]
+        for case in ([single] if single else getattr(self, '_TESTS', [])):
+            if not include_onlymatching and case.get('only_matching', False):
+                continue
+            case['name'] = ie_name
+            yield case
+
+    def is_suitable(self, age_limit):
+        """ Test whether the extractor is generally suitable for the given
+        age limit (i.e. pornographic sites are not, all others usually are).
+        Judged by the info_dict age_limit of the extractor's test cases. """
+        any_restricted = False
+        for tc in self.get_testcases(include_onlymatching=False):
+            # A playlist test is judged by its first entry, if it has one
+            if tc.get('playlist', []):
+                tc = tc['playlist'][0]
+            if not age_restricted(
+                    tc.get('info_dict', {}).get('age_limit'), age_limit):
+                return True
+            any_restricted = True
+        return not any_restricted
+
  
  class SearchInfoExtractor(InfoExtractor):
      """