Fix typos

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index d694e818e98dc29939118a6d9fccb8b942b03128..b05b22a94b0bac346e776ef5c3d42305cf5f4163 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,20 +10,17 @@ import re
  import socket
  import sys
  import time
  import socket
  import sys
  import time
-import xml.etree.ElementTree
  
  from ..compat import (
      compat_cookiejar,
      compat_cookies,
      compat_getpass,
  
  from ..compat import (
      compat_cookiejar,
      compat_cookies,
      compat_getpass,
-    compat_HTTPError,
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse,
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse,
-    compat_urllib_parse_urlparse,
-    compat_urllib_request,
      compat_urlparse,
      compat_str,
      compat_urlparse,
      compat_str,
+    compat_etree_fromstring,
  )
  from ..utils import (
      NO_DEFAULT,
  )
  from ..utils import (
      NO_DEFAULT,
@@ -32,16 +29,20 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      determine_ext,
      clean_html,
      compiled_regex_type,
      determine_ext,
+    error_to_compat_str,
      ExtractorError,
      fix_xml_ampersands,
      float_or_none,
      int_or_none,
      RegexNotFoundError,
      sanitize_filename,
      ExtractorError,
      fix_xml_ampersands,
      float_or_none,
      int_or_none,
      RegexNotFoundError,
      sanitize_filename,
+    sanitized_Request,
      unescapeHTML,
      unescapeHTML,
+    unified_strdate,
      url_basename,
      xpath_text,
      xpath_with_ns,
      url_basename,
      xpath_text,
      xpath_with_ns,
+    determine_protocol,
  )
  
  
  )
  
  
@@ -107,8 +108,9 @@ class InfoExtractor(object):
                                   -2 or smaller for less than default.
                                   < -1000 to hide the format (if there is
                                      another one which is strictly better)
                                   -2 or smaller for less than default.
                                   < -1000 to hide the format (if there is
                                      another one which is strictly better)
-                    * language_preference  Is this in the correct requested
-                                 language?
+                    * language   Language code, e.g. "de" or "en-US".
+                    * language_preference  Is this in the language mentioned in
+                                 the URL?
                                   10 if it's what the URL is about,
                                   -1 for default (don't know),
                                   -10 otherwise, other values reserved for now.
                                   10 if it's what the URL is about,
                                   -1 for default (don't know),
                                   -10 otherwise, other values reserved for now.
@@ -152,6 +154,7 @@ class InfoExtractor(object):
      description:    Full video description.
      uploader:       Full name of the video uploader.
      creator:        The main artist who created the video.
      description:    Full video description.
      uploader:       Full name of the video uploader.
      creator:        The main artist who created the video.
+    release_date:   The date (YYYYMMDD) when the video was released.
      timestamp:      UNIX timestamp of the moment the video became available.
      upload_date:    Video upload date (YYYYMMDD).
                      If not explicitly set, calculated from timestamp.
      timestamp:      UNIX timestamp of the moment the video became available.
      upload_date:    Video upload date (YYYYMMDD).
                      If not explicitly set, calculated from timestamp.
@@ -163,12 +166,14 @@ class InfoExtractor(object):
                      with the "ext" entry and one of:
                          * "data": The subtitles file contents
                          * "url": A URL pointing to the subtitles file
                      with the "ext" entry and one of:
                          * "data": The subtitles file contents
                          * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
      automatic_captions: Like 'subtitles', used by the YoutubeIE for
                      automatically generated captions
      automatic_captions: Like 'subtitles', used by the YoutubeIE for
                      automatically generated captions
-    duration:       Length of the video in seconds, as an integer.
+    duration:       Length of the video in seconds, as an integer or float.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
      average_rating: Average rating give by users, the scale used depends on the webpage
      comment_count:  Number of comments on the video
      comments:       A list of comments, each with one or more of the following
      average_rating: Average rating give by users, the scale used depends on the webpage
      comment_count:  Number of comments on the video
      comments:       A list of comments, each with one or more of the following
@@ -196,6 +201,26 @@ class InfoExtractor(object):
      end_time:       Time in seconds where the reproduction should end, as
                      specified in the URL.
  
      end_time:       Time in seconds where the reproduction should end, as
                      specified in the URL.
  
+    The following fields should only be used when the video belongs to some logical
+    chapter or section:
+
+    chapter:        Name or title of the chapter the video belongs to.
+    chapter_number: Number of the chapter the video belongs to, as an integer.
+    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
+
+    The following fields should only be used when the video is an episode of some
+    series or programme:
+
+    series:         Title of the series or programme the video episode belongs to.
+    season:         Title of the season the video episode belongs to.
+    season_number:  Number of the season the video episode belongs to, as an integer.
+    season_id:      Id of the season the video episode belongs to, as a unicode string.
+    episode:        Title of the video episode. Unlike mandatory video title field,
+                    this field should denote the exact title of the video episode
+                    without any kind of decoration.
+    episode_number: Number of the video episode within a season, as an integer.
+    episode_id:     Id of the video episode, as a unicode string.
+
      Unless mentioned otherwise, the fields should be Unicode strings.
  
      Unless mentioned otherwise, None is equivalent to absence of information.
      Unless mentioned otherwise, the fields should be Unicode strings.
  
      Unless mentioned otherwise, None is equivalent to absence of information.
@@ -288,9 +313,9 @@ class InfoExtractor(object):
          except ExtractorError:
              raise
          except compat_http_client.IncompleteRead as e:
          except ExtractorError:
              raise
          except compat_http_client.IncompleteRead as e:
-            raise ExtractorError('A network error has occured.', cause=e, expected=True)
+            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
          except (KeyError, StopIteration) as e:
          except (KeyError, StopIteration) as e:
-            raise ExtractorError('An extractor error has occured.', cause=e)
+            raise ExtractorError('An extractor error has occurred.', cause=e)
  
      def set_downloader(self, downloader):
          """Sets the downloader for this IE."""
  
      def set_downloader(self, downloader):
          """Sets the downloader for this IE."""
@@ -307,11 +332,11 @@ class InfoExtractor(object):
      @classmethod
      def ie_key(cls):
          """A string for getting the InfoExtractor with get_info_extractor"""
      @classmethod
      def ie_key(cls):
          """A string for getting the InfoExtractor with get_info_extractor"""
-        return cls.__name__[:-2]
+        return compat_str(cls.__name__[:-2])
  
      @property
      def IE_NAME(self):
  
      @property
      def IE_NAME(self):
-        return type(self).__name__[:-2]
+        return compat_str(type(self).__name__[:-2])
  
      def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
          """ Returns the response handle """
  
      def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
          """ Returns the response handle """
@@ -329,7 +354,8 @@ class InfoExtractor(object):
                  return False
              if errnote is None:
                  errnote = 'Unable to download webpage'
                  return False
              if errnote is None:
                  errnote = 'Unable to download webpage'
-            errmsg = '%s: %s' % (errnote, compat_str(err))
+
+            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
              if fatal:
                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
              else:
              if fatal:
                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
              else:
@@ -458,7 +484,7 @@ class InfoExtractor(object):
              return xml_string
          if transform_source:
              xml_string = transform_source(xml_string)
              return xml_string
          if transform_source:
              xml_string = transform_source(xml_string)
-        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+        return compat_etree_fromstring(xml_string.encode('utf-8'))
  
      def _download_json(self, url_or_request, video_id,
                         note='Downloading JSON metadata',
  
      def _download_json(self, url_or_request, video_id,
                         note='Downloading JSON metadata',
@@ -516,6 +542,12 @@ class InfoExtractor(object):
              '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
              expected=True)
  
              '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
              expected=True)
  
+    @staticmethod
+    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+        raise ExtractorError(
+            '%s. You might want to use --proxy to workaround.' % msg,
+            expected=True)
+
      # Methods for following #608
      @staticmethod
      def url_result(url, ie=None, video_id=None, video_title=None):
      # Methods for following #608
      @staticmethod
      def url_result(url, ie=None, video_id=None, video_title=None):
@@ -613,7 +645,7 @@ class InfoExtractor(object):
                  else:
                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
              except (IOError, netrc.NetrcParseError) as err:
                  else:
                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
              except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
+                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
  
          return (username, password)
  
  
          return (username, password)
  
@@ -636,8 +668,9 @@ class InfoExtractor(object):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
-        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+                       % {'prop': re.escape(prop)})
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
              template % (property_re, content_re),
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
              template % (property_re, content_re),
@@ -766,14 +799,12 @@ class InfoExtractor(object):
  
              preference = f.get('preference')
              if preference is None:
  
              preference = f.get('preference')
              if preference is None:
-                proto = f.get('protocol')
-                if proto is None:
-                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
-
-                preference = 0 if proto in ['http', 'https'] else -0.1
+                preference = 0
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
+            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+
              if f.get('vcodec') == 'none':  # audio only
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
              if f.get('vcodec') == 'none':  # audio only
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
@@ -804,6 +835,7 @@ class InfoExtractor(object):
                  f.get('vbr') if f.get('vbr') is not None else -1,
                  f.get('height') if f.get('height') is not None else -1,
                  f.get('width') if f.get('width') is not None else -1,
                  f.get('vbr') if f.get('vbr') is not None else -1,
                  f.get('height') if f.get('height') is not None else -1,
                  f.get('width') if f.get('width') is not None else -1,
+                proto_preference,
                  ext_preference,
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
                  ext_preference,
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
@@ -831,7 +863,7 @@ class InfoExtractor(object):
              self._request_webpage(url, video_id, 'Checking %s URL' % item)
              return True
          except ExtractorError as e:
              self._request_webpage(url, video_id, 'Checking %s URL' % item)
              return True
          except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError):
+            if isinstance(e.cause, compat_urllib_error.URLError):
                  self.to_screen(
                      '%s: %s URL is invalid, skipping' % (video_id, item))
                  return False
                  self.to_screen(
                      '%s: %s URL is invalid, skipping' % (video_id, item))
                  return False
@@ -862,13 +894,18 @@ class InfoExtractor(object):
          time.sleep(timeout)
  
      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
          time.sleep(timeout)
  
      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
-                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True):
          manifest = self._download_xml(
              manifest_url, video_id, 'Downloading f4m manifest',
              'Unable to download f4m manifest',
              # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
              # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
          manifest = self._download_xml(
              manifest_url, video_id, 'Downloading f4m manifest',
              'Unable to download f4m manifest',
              # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
              # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
-            transform_source=transform_source)
+            transform_source=transform_source,
+            fatal=fatal)
+
+        if manifest is False:
+            return []
  
          formats = []
          manifest_version = '1.0'
  
          formats = []
          manifest_version = '1.0'
@@ -876,6 +913,11 @@ class InfoExtractor(object):
          if not media_nodes:
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
          if not media_nodes:
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        base_url = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+            'base URL', default=None)
+        if base_url:
+            base_url = base_url.strip()
          for i, media_el in enumerate(media_nodes):
              if manifest_version == '2.0':
                  media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
          for i, media_el in enumerate(media_nodes):
              if manifest_version == '2.0':
                  media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -883,13 +925,14 @@ class InfoExtractor(object):
                      continue
                  manifest_url = (
                      media_url if media_url.startswith('http://') or media_url.startswith('https://')
                      continue
                  manifest_url = (
                      media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                  # If media_url is itself a f4m manifest do the recursive extraction
                  # since bitrates in parent manifest (this one) and media_url manifest
                  # may differ leading to inability to resolve the format by requested
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
                  # If media_url is itself a f4m manifest do the recursive extraction
                  # since bitrates in parent manifest (this one) and media_url manifest
                  # may differ leading to inability to resolve the format by requested
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    formats.extend(self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
@@ -925,13 +968,15 @@ class InfoExtractor(object):
              if re.match(r'^https?://', u)
              else compat_urlparse.urljoin(m3u8_url, u))
  
              if re.match(r'^https?://', u)
              else compat_urlparse.urljoin(m3u8_url, u))
  
-        m3u8_doc = self._download_webpage(
+        res = self._download_webpage_handle(
              m3u8_url, video_id,
              note=note or 'Downloading m3u8 information',
              errnote=errnote or 'Failed to download m3u8 information',
              fatal=fatal)
              m3u8_url, video_id,
              note=note or 'Downloading m3u8 information',
              errnote=errnote or 'Failed to download m3u8 information',
              fatal=fatal)
-        if m3u8_doc is False:
-            return m3u8_doc
+        if res is False:
+            return []
+        m3u8_doc, urlh = res
+        m3u8_url = urlh.geturl()
          last_info = None
          last_media = None
          kv_rex = re.compile(
          last_info = None
          last_media = None
          kv_rex = re.compile(
@@ -1037,6 +1082,7 @@ class InfoExtractor(object):
          video_id = os.path.splitext(url_basename(smil_url))[0]
          title = None
          description = None
          video_id = os.path.splitext(url_basename(smil_url))[0]
          title = None
          description = None
+        upload_date = None
          for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
              name = meta.attrib.get('name')
              content = meta.attrib.get('content')
          for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
              name = meta.attrib.get('name')
              content = meta.attrib.get('content')
@@ -1046,11 +1092,22 @@ class InfoExtractor(object):
                  title = content
              elif not description and name in ('description', 'abstract'):
                  description = content
                  title = content
              elif not description and name in ('description', 'abstract'):
                  description = content
+            elif not upload_date and name == 'date':
+                upload_date = unified_strdate(content)
+
+        thumbnails = [{
+            'id': image.get('type'),
+            'url': image.get('src'),
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
  
          return {
              'id': video_id,
              'title': title or video_id,
              'description': description,
  
          return {
              'id': video_id,
              'title': title or video_id,
              'description': description,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
              'formats': formats,
              'subtitles': subtitles,
          }
              'formats': formats,
              'subtitles': subtitles,
          }
@@ -1077,7 +1134,7 @@ class InfoExtractor(object):
              if not src:
                  continue
  
              if not src:
                  continue
  
-            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
              filesize = int_or_none(video.get('size') or video.get('fileSize'))
              width = int_or_none(video.get('width'))
              height = int_or_none(video.get('height'))
              filesize = int_or_none(video.get('size') or video.get('fileSize'))
              width = int_or_none(video.get('width'))
              height = int_or_none(video.get('height'))
@@ -1110,7 +1167,7 @@ class InfoExtractor(object):
  
              if proto == 'm3u8' or src_ext == 'm3u8':
                  formats.extend(self._extract_m3u8_formats(
  
              if proto == 'm3u8' or src_ext == 'm3u8':
                  formats.extend(self._extract_m3u8_formats(
-                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False))
                  continue
  
              if src_ext == 'f4m':
                  continue
  
              if src_ext == 'f4m':
@@ -1122,10 +1179,10 @@ class InfoExtractor(object):
                      }
                  f4m_url += '&' if '?' in f4m_url else '?'
                  f4m_url += compat_urllib_parse.urlencode(f4m_params)
                      }
                  f4m_url += '&' if '?' in f4m_url else '?'
                  f4m_url += compat_urllib_parse.urlencode(f4m_params)
-                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                  continue
  
                  continue
  
-            if src_url.startswith('http'):
+            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                  http_count += 1
                  formats.append({
                      'url': src_url,
                  http_count += 1
                  formats.append({
                      'url': src_url,
@@ -1244,7 +1301,7 @@ class InfoExtractor(object):
  
      def _get_cookies(self, url):
          """ Return a compat_cookies.SimpleCookie with the cookies for the url """
  
      def _get_cookies(self, url):
          """ Return a compat_cookies.SimpleCookie with the cookies for the url """
-        req = compat_urllib_request.Request(url)
+        req = sanitized_Request(url)
          self._downloader.cookiejar.add_cookie_header(req)
          return compat_cookies.SimpleCookie(req.get_header('Cookie'))
  
          self._downloader.cookiejar.add_cookie_header(req)
          return compat_cookies.SimpleCookie(req.get_header('Cookie'))