PEP8: applied even more rules

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index f43a0a569a3d90d555a27cece8ac3e68951c5106..3c2d46dd5c8ee780a04cd0f3fedb05e33707b71c 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -12,13 +12,14 @@ import sys
  import time
  import xml.etree.ElementTree
  
-from ..utils import (
+from ..compat import (
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse_urlparse,
      compat_urlparse,
      compat_str,
-
+)
+from ..utils import (
      clean_html,
      compiled_regex_type,
      ExtractorError,
@@ -42,7 +43,11 @@ class InfoExtractor(object):
      information possibly downloading the video to the file system, among
      other possible outcomes.
  
-    The dictionaries must include the following fields:
+    The type field determines the type of the result.
+    By far the most common value (and the default if _type is missing) is
+    "video", which indicates a single video.
+
+    For a video, the dictionaries must include the following fields:
  
      id:             Video identifier.
      title:          Video title, unescaped.
@@ -72,6 +77,7 @@ class InfoExtractor(object):
                      * acodec     Name of the audio codec in use
                      * asr        Audio sampling rate in Hertz
                      * vbr        Average video bitrate in KBit/s
+                    * fps        Frame rate
                      * vcodec     Name of the video codec in use
                      * container  Name of the container format
                      * filesize   The number of bytes, if known in advance
@@ -85,10 +91,19 @@ class InfoExtractor(object):
                                   by this field, regardless of all other values.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
+                    * language_preference  Is this in the correct requested
+                                 language?
+                                 10 if it's what the URL is about,
+                                 -1 for default (don't know),
+                                 -10 otherwise, other values reserved for now.
                      * quality    Order number of the video quality of this
                                   format, irrespective of the file format.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
+                    * source_preference  Order number for this video source
+                                  (quality takes higher priority)
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
                      * http_referer  HTTP Referer header value to set.
                      * http_method  HTTP method to use for the download.
                      * http_headers  A dictionary of additional HTTP headers
@@ -138,6 +153,40 @@ class InfoExtractor(object):
  
      Unless mentioned otherwise, the fields should be Unicode strings.
  
+    Unless mentioned otherwise, None is equivalent to absence of information.
+
+
+    _type "playlist" indicates multiple videos.
+    There must be a key "entries", which is a list or a PagedList object, each
+    element of which is a valid dictionary under this specification.
+
+    Additionally, playlists can have "title" and "id" attributes with the same
+    semantics as videos (see above).
+
+
+    _type "multi_video" indicates that there are multiple videos that
+    form a single show, for example multiple acts of an opera or TV episode.
+    It must have an entries key like a playlist and contain all the keys
+    required for a video at the same time.
+
+
+    _type "url" indicates that the video must be extracted from another
+    location, possibly by a different extractor. Its only required key is:
+    "url" - the next URL to extract.
+
+    Additionally, it may have properties believed to be identical to the
+    resolved entity, for example "title" if the title of the referred video is
+    known ahead of time.
+
+
+    _type "url_transparent" entities have the same specification as "url", but
+    indicate that the given additional information is more precise than the one
+    associated with the resolved URL.
+    This is useful when a site employs a video service that hosts the video and
+    its technical metadata, but that video service does not embed a useful
+    title, description etc.
+
+
      Subclasses of this one should re-define the _real_initialize() and
      _real_extract() methods and define a _VALID_URL regexp.
      Probably, they should also be added to the list of extractors.
@@ -236,7 +285,6 @@ class InfoExtractor(object):
  
      def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
          """ Returns a tuple (page content as string, URL handle) """
-
          # Strip hashes from the URL (#1038)
          if isinstance(url_or_request, (compat_str, str)):
              url_or_request = url_or_request.partition('#')[0]
@@ -245,6 +293,10 @@ class InfoExtractor(object):
          if urlh is False:
              assert not fatal
              return False
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        return (content, urlh)
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
          content_type = urlh.headers.get('Content-Type', '')
          webpage_bytes = urlh.read()
          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -279,6 +331,12 @@ class InfoExtractor(object):
              raw_filename = basen + '.dump'
              filename = sanitize_filename(raw_filename, restricted=True)
              self.to_screen('Saving request to ' + filename)
+            # Working around MAX_PATH limitation on Windows (see
+            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
+            if os.name == 'nt':
+                absfilepath = os.path.abspath(filename)
+                if len(absfilepath) > 259:
+                    filename = '\\\\?\\' + absfilepath
              with open(filename, 'wb') as outf:
                  outf.write(webpage_bytes)
  
@@ -297,7 +355,7 @@ class InfoExtractor(object):
                  msg += ' Visit %s for more details' % blocked_iframe
              raise ExtractorError(msg, expected=True)
  
-        return (content, urlh)
+        return content
  
      def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
          """ Returns the data of the page as a string """
@@ -334,7 +392,11 @@ class InfoExtractor(object):
          try:
              return json.loads(json_string)
          except ValueError as ve:
-            raise ExtractorError('Failed to download JSON', cause=ve)
+            errmsg = '%s: Failed to parse JSON ' % video_id
+            if fatal:
+                raise ExtractorError(errmsg, cause=ve)
+            else:
+                self.report_warning(errmsg + str(ve))
  
      def report_warning(self, msg, video_id=None):
          idstr = '' if video_id is None else '%s: ' % video_id
@@ -361,17 +423,18 @@ class InfoExtractor(object):
          """Report attempt to log in."""
          self.to_screen('Logging in')
  
-    #Methods for following #608
+    # Methods for following #608
      @staticmethod
      def url_result(url, ie=None, video_id=None):
          """Returns a url that points to a page that should be processed"""
-        #TODO: ie should be the class used for getting the info
+        # TODO: ie should be the class used for getting the info
          video_info = {'_type': 'url',
                        'url': url,
                        'ie_key': ie}
          if video_id is not None:
              video_info['id'] = video_id
          return video_info
+
      @staticmethod
      def playlist_result(entries, playlist_id=None, playlist_title=None):
          """Returns a playlist"""
@@ -383,7 +446,7 @@ class InfoExtractor(object):
              video_info['title'] = playlist_title
          return video_info
  
-    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
          """
          Perform a regex search on the given string, using a single or a list of
          patterns returning the first matching group.
@@ -404,22 +467,25 @@ class InfoExtractor(object):
              _name = name
  
          if mobj:
-            # return the first matching group
-            return next(g for g in mobj.groups() if g is not None)
+            if group is None:
+                # return the first matching group
+                return next(g for g in mobj.groups() if g is not None)
+            else:
+                return mobj.group(group)
          elif default is not _NO_DEFAULT:
              return default
          elif fatal:
              raise RegexNotFoundError('Unable to extract %s' % _name)
          else:
              self._downloader.report_warning('unable to extract %s; '
-                'please report this issue on http://yt-dl.org/bug' % _name)
+                                            'please report this issue on http://yt-dl.org/bug' % _name)
              return None
  
-    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
          """
          Like _search_regex, but strips HTML tags and unescapes entities.
          """
-        res = self._search_regex(pattern, string, name, default, fatal, flags)
+        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
          if res:
              return clean_html(res).strip()
          else:
@@ -452,7 +518,7 @@ class InfoExtractor(object):
                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
              except (IOError, netrc.NetrcParseError) as err:
                  self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
-        
+
          return (username, password)
  
      def _get_tfa_info(self):
@@ -513,9 +579,9 @@ class InfoExtractor(object):
              display_name = name
          return self._html_search_regex(
              r'''(?ix)<meta
-                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
-                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
-            html, display_name, fatal=fatal, **kwargs)
+                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+                    [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
+            html, display_name, fatal=fatal, group='content', **kwargs)
  
      def _dc_search_uploader(self, html):
          return self._html_search_meta('dc.creator', html, 'uploader')
@@ -546,7 +612,7 @@ class InfoExtractor(object):
  
      def _twitter_search_player(self, html):
          return self._html_search_meta('twitter:player', html,
-            'twitter card player')
+                                      'twitter card player')
  
      def _sort_formats(self, formats):
          if not formats:
@@ -591,6 +657,7 @@ class InfoExtractor(object):
  
              return (
                  preference,
+                f.get('language_preference') if f.get('language_preference') is not None else -1,
                  f.get('quality') if f.get('quality') is not None else -1,
                  f.get('height') if f.get('height') is not None else -1,
                  f.get('width') if f.get('width') is not None else -1,
@@ -599,14 +666,16 @@ class InfoExtractor(object):
                  f.get('vbr') if f.get('vbr') is not None else -1,
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
+                f.get('fps') if f.get('fps') is not None else -1,
                  f.get('filesize') if f.get('filesize') is not None else -1,
                  f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
+                f.get('source_preference') if f.get('source_preference') is not None else -1,
                  f.get('format_id'),
              )
          formats.sort(key=_formats_key)
  
      def http_scheme(self):
-        """ Either "https:" or "https:", depending on the user's preferences """
+        """ Either "http:" or "https:", depending on the user's preferences """
          return (
              'http:'
              if self._downloader.params.get('prefer_insecure', False)
@@ -669,7 +738,10 @@ class InfoExtractor(object):
              if re.match(r'^https?://', u)
              else compat_urlparse.urljoin(m3u8_url, u))
  
-        m3u8_doc = self._download_webpage(m3u8_url, video_id)
+        m3u8_doc = self._download_webpage(
+            m3u8_url, video_id,
+            note='Downloading m3u8 information',
+            errnote='Failed to download m3u8 information')
          last_info = None
          kv_rex = re.compile(
              r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')