Fix typos

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index b86d0652338457a65c4fc655d5e0406aa42850b0..b05b22a94b0bac346e776ef5c3d42305cf5f4163 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,20 +10,17 @@ import re
  import socket
  import sys
  import time
-import xml.etree.ElementTree
  
  from ..compat import (
      compat_cookiejar,
      compat_cookies,
      compat_getpass,
-    compat_HTTPError,
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse,
-    compat_urllib_parse_urlparse,
-    compat_urllib_request,
      compat_urlparse,
      compat_str,
+    compat_etree_fromstring,
  )
  from ..utils import (
      NO_DEFAULT,
@@ -32,17 +29,20 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      determine_ext,
+    error_to_compat_str,
      ExtractorError,
      fix_xml_ampersands,
      float_or_none,
      int_or_none,
      RegexNotFoundError,
      sanitize_filename,
+    sanitized_Request,
      unescapeHTML,
      unified_strdate,
      url_basename,
      xpath_text,
      xpath_with_ns,
+    determine_protocol,
  )
  
  
@@ -108,8 +108,9 @@ class InfoExtractor(object):
                                   -2 or smaller for less than default.
                                   < -1000 to hide the format (if there is
                                      another one which is strictly better)
-                    * language_preference  Is this in the correct requested
-                                 language?
+                    * language   Language code, e.g. "de" or "en-US".
+                    * language_preference  Is this in the language mentioned in
+                                 the URL?
                                   10 if it's what the URL is about,
                                   -1 for default (don't know),
                                   -10 otherwise, other values reserved for now.
@@ -165,12 +166,14 @@ class InfoExtractor(object):
                      with the "ext" entry and one of:
                          * "data": The subtitles file contents
                          * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
      automatic_captions: Like 'subtitles', used by the YoutubeIE for
                      automatically generated captions
-    duration:       Length of the video in seconds, as an integer.
+    duration:       Length of the video in seconds, as an integer or float.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
      average_rating: Average rating give by users, the scale used depends on the webpage
      comment_count:  Number of comments on the video
      comments:       A list of comments, each with one or more of the following
@@ -198,6 +201,26 @@ class InfoExtractor(object):
      end_time:       Time in seconds where the reproduction should end, as
                      specified in the URL.
  
+    The following fields should only be used when the video belongs to some logical
+    chapter or section:
+
+    chapter:        Name or title of the chapter the video belongs to.
+    chapter_number: Number of the chapter the video belongs to, as an integer.
+    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
+
+    The following fields should only be used when the video is an episode of some
+    series or programme:
+
+    series:         Title of the series or programme the video episode belongs to.
+    season:         Title of the season the video episode belongs to.
+    season_number:  Number of the season the video episode belongs to, as an integer.
+    season_id:      Id of the season the video episode belongs to, as a unicode string.
+    episode:        Title of the video episode. Unlike the mandatory video title field,
+                    this field should denote the exact title of the video episode
+                    without any kind of decoration.
+    episode_number: Number of the video episode within a season, as an integer.
+    episode_id:     Id of the video episode, as a unicode string.
+
      Unless mentioned otherwise, the fields should be Unicode strings.
  
      Unless mentioned otherwise, None is equivalent to absence of information.
@@ -290,9 +313,9 @@ class InfoExtractor(object):
          except ExtractorError:
              raise
          except compat_http_client.IncompleteRead as e:
-            raise ExtractorError('A network error has occured.', cause=e, expected=True)
+            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
          except (KeyError, StopIteration) as e:
-            raise ExtractorError('An extractor error has occured.', cause=e)
+            raise ExtractorError('An extractor error has occurred.', cause=e)
  
      def set_downloader(self, downloader):
          """Sets the downloader for this IE."""
@@ -309,11 +332,11 @@ class InfoExtractor(object):
      @classmethod
      def ie_key(cls):
          """A string for getting the InfoExtractor with get_info_extractor"""
-        return cls.__name__[:-2]
+        return compat_str(cls.__name__[:-2])
  
      @property
      def IE_NAME(self):
-        return type(self).__name__[:-2]
+        return compat_str(type(self).__name__[:-2])
  
      def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
          """ Returns the response handle """
@@ -331,7 +354,8 @@ class InfoExtractor(object):
                  return False
              if errnote is None:
                  errnote = 'Unable to download webpage'
-            errmsg = '%s: %s' % (errnote, compat_str(err))
+
+            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
              if fatal:
                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
              else:
@@ -460,7 +484,7 @@ class InfoExtractor(object):
              return xml_string
          if transform_source:
              xml_string = transform_source(xml_string)
-        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+        return compat_etree_fromstring(xml_string.encode('utf-8'))
  
      def _download_json(self, url_or_request, video_id,
                         note='Downloading JSON metadata',
@@ -621,7 +645,7 @@ class InfoExtractor(object):
                  else:
                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
              except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
+                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
  
          return (username, password)
  
@@ -644,8 +668,9 @@ class InfoExtractor(object):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
-        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+                       % {'prop': re.escape(prop)})
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
              template % (property_re, content_re),
@@ -774,14 +799,12 @@ class InfoExtractor(object):
  
              preference = f.get('preference')
              if preference is None:
-                proto = f.get('protocol')
-                if proto is None:
-                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
-
-                preference = 0 if proto in ['http', 'https'] else -0.1
+                preference = 0
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
+            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+
              if f.get('vcodec') == 'none':  # audio only
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
@@ -812,6 +835,7 @@ class InfoExtractor(object):
                  f.get('vbr') if f.get('vbr') is not None else -1,
                  f.get('height') if f.get('height') is not None else -1,
                  f.get('width') if f.get('width') is not None else -1,
+                proto_preference,
                  ext_preference,
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
@@ -839,7 +863,7 @@ class InfoExtractor(object):
              self._request_webpage(url, video_id, 'Checking %s URL' % item)
              return True
          except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError):
+            if isinstance(e.cause, compat_urllib_error.URLError):
                  self.to_screen(
                      '%s: %s URL is invalid, skipping' % (video_id, item))
                  return False
@@ -870,13 +894,18 @@ class InfoExtractor(object):
          time.sleep(timeout)
  
      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
-                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True):
          manifest = self._download_xml(
              manifest_url, video_id, 'Downloading f4m manifest',
              'Unable to download f4m manifest',
              # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
              # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
-            transform_source=transform_source)
+            transform_source=transform_source,
+            fatal=fatal)
+
+        if manifest is False:
+            return []
  
          formats = []
          manifest_version = '1.0'
@@ -884,6 +913,11 @@ class InfoExtractor(object):
          if not media_nodes:
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        base_url = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+            'base URL', default=None)
+        if base_url:
+            base_url = base_url.strip()
          for i, media_el in enumerate(media_nodes):
              if manifest_version == '2.0':
                  media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -891,13 +925,14 @@ class InfoExtractor(object):
                      continue
                  manifest_url = (
                      media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                  # If media_url is itself a f4m manifest do the recursive extraction
                  # since bitrates in parent manifest (this one) and media_url manifest
                  # may differ leading to inability to resolve the format by requested
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    formats.extend(self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
@@ -933,13 +968,15 @@ class InfoExtractor(object):
              if re.match(r'^https?://', u)
              else compat_urlparse.urljoin(m3u8_url, u))
  
-        m3u8_doc = self._download_webpage(
+        res = self._download_webpage_handle(
              m3u8_url, video_id,
              note=note or 'Downloading m3u8 information',
              errnote=errnote or 'Failed to download m3u8 information',
              fatal=fatal)
-        if m3u8_doc is False:
-            return m3u8_doc
+        if res is False:
+            return []
+        m3u8_doc, urlh = res
+        m3u8_url = urlh.geturl()
          last_info = None
          last_media = None
          kv_rex = re.compile(
@@ -1130,7 +1167,7 @@ class InfoExtractor(object):
  
              if proto == 'm3u8' or src_ext == 'm3u8':
                  formats.extend(self._extract_m3u8_formats(
-                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False))
                  continue
  
              if src_ext == 'f4m':
@@ -1142,10 +1179,10 @@ class InfoExtractor(object):
                      }
                  f4m_url += '&' if '?' in f4m_url else '?'
                  f4m_url += compat_urllib_parse.urlencode(f4m_params)
-                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                  continue
  
-            if src_url.startswith('http'):
+            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                  http_count += 1
                  formats.append({
                      'url': src_url,
@@ -1264,7 +1301,7 @@ class InfoExtractor(object):
  
      def _get_cookies(self, url):
          """ Return a compat_cookies.SimpleCookie with the cookies for the url """
-        req = compat_urllib_request.Request(url)
+        req = sanitized_Request(url)
          self._downloader.cookiejar.add_cookie_header(req)
          return compat_cookies.SimpleCookie(req.get_header('Cookie'))