Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178)

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index d694e818e98dc29939118a6d9fccb8b942b03128..52523d7b249210eed88d334c2fefe28cabe78706 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,13 +10,11 @@ import re
  import socket
  import sys
  import time
-import xml.etree.ElementTree
  
  from ..compat import (
      compat_cookiejar,
      compat_cookies,
      compat_getpass,
-    compat_HTTPError,
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse,
@@ -24,6 +22,7 @@ from ..compat import (
      compat_urllib_request,
      compat_urlparse,
      compat_str,
+    compat_etree_fromstring,
  )
  from ..utils import (
      NO_DEFAULT,
@@ -39,6 +38,7 @@ from ..utils import (
      RegexNotFoundError,
      sanitize_filename,
      unescapeHTML,
+    unified_strdate,
      url_basename,
      xpath_text,
      xpath_with_ns,
@@ -152,6 +152,7 @@ class InfoExtractor(object):
      description:    Full video description.
      uploader:       Full name of the video uploader.
      creator:        The main artist who created the video.
+    release_date:   The date (YYYYMMDD) when the video was released.
      timestamp:      UNIX timestamp of the moment the video became available.
      upload_date:    Video upload date (YYYYMMDD).
                      If not explicitly set, calculated from timestamp.
@@ -163,12 +164,14 @@ class InfoExtractor(object):
                      with the "ext" entry and one of:
                          * "data": The subtitles file contents
                          * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
      automatic_captions: Like 'subtitles', used by the YoutubeIE for
                      automatically generated captions
      duration:       Length of the video in seconds, as an integer.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
      average_rating: Average rating give by users, the scale used depends on the webpage
      comment_count:  Number of comments on the video
      comments:       A list of comments, each with one or more of the following
@@ -458,7 +461,7 @@ class InfoExtractor(object):
              return xml_string
          if transform_source:
              xml_string = transform_source(xml_string)
-        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+        return compat_etree_fromstring(xml_string.encode('utf-8'))
  
      def _download_json(self, url_or_request, video_id,
                         note='Downloading JSON metadata',
@@ -516,6 +519,12 @@ class InfoExtractor(object):
              '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
              expected=True)
  
+    @staticmethod
+    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+        raise ExtractorError(
+            '%s. You might want to use --proxy to workaround.' % msg,
+            expected=True)
+
      # Methods for following #608
      @staticmethod
      def url_result(url, ie=None, video_id=None, video_title=None):
@@ -636,8 +645,9 @@ class InfoExtractor(object):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
-        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+                       % {'prop': re.escape(prop)})
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
              template % (property_re, content_re),
@@ -831,7 +841,7 @@ class InfoExtractor(object):
              self._request_webpage(url, video_id, 'Checking %s URL' % item)
              return True
          except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError):
+            if isinstance(e.cause, compat_urllib_error.URLError):
                  self.to_screen(
                      '%s: %s URL is invalid, skipping' % (video_id, item))
                  return False
@@ -862,13 +872,18 @@ class InfoExtractor(object):
          time.sleep(timeout)
  
      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
-                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True):
          manifest = self._download_xml(
              manifest_url, video_id, 'Downloading f4m manifest',
              'Unable to download f4m manifest',
              # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
              # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
-            transform_source=transform_source)
+            transform_source=transform_source,
+            fatal=fatal)
+
+        if manifest is False:
+            return manifest
  
          formats = []
          manifest_version = '1.0'
@@ -889,7 +904,10 @@ class InfoExtractor(object):
                  # may differ leading to inability to resolve the format by requested
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    f4m_formats = self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal)
+                    if f4m_formats:
+                        formats.extend(f4m_formats)
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
@@ -925,13 +943,14 @@ class InfoExtractor(object):
              if re.match(r'^https?://', u)
              else compat_urlparse.urljoin(m3u8_url, u))
  
-        m3u8_doc = self._download_webpage(
+        m3u8_doc, urlh = self._download_webpage_handle(
              m3u8_url, video_id,
              note=note or 'Downloading m3u8 information',
              errnote=errnote or 'Failed to download m3u8 information',
              fatal=fatal)
          if m3u8_doc is False:
              return m3u8_doc
+        m3u8_url = urlh.geturl()
          last_info = None
          last_media = None
          kv_rex = re.compile(
@@ -1037,6 +1056,7 @@ class InfoExtractor(object):
          video_id = os.path.splitext(url_basename(smil_url))[0]
          title = None
          description = None
+        upload_date = None
          for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
              name = meta.attrib.get('name')
              content = meta.attrib.get('content')
@@ -1046,11 +1066,22 @@ class InfoExtractor(object):
                  title = content
              elif not description and name in ('description', 'abstract'):
                  description = content
+            elif not upload_date and name == 'date':
+                upload_date = unified_strdate(content)
+
+        thumbnails = [{
+            'id': image.get('type'),
+            'url': image.get('src'),
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
  
          return {
              'id': video_id,
              'title': title or video_id,
              'description': description,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
              'formats': formats,
              'subtitles': subtitles,
          }
@@ -1077,7 +1108,7 @@ class InfoExtractor(object):
              if not src:
                  continue
  
-            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
              filesize = int_or_none(video.get('size') or video.get('fileSize'))
              width = int_or_none(video.get('width'))
              height = int_or_none(video.get('height'))
@@ -1109,8 +1140,10 @@ class InfoExtractor(object):
              src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
  
              if proto == 'm3u8' or src_ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                m3u8_formats = self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
                  continue
  
              if src_ext == 'f4m':
@@ -1122,10 +1155,12 @@ class InfoExtractor(object):
                      }
                  f4m_url += '&' if '?' in f4m_url else '?'
                  f4m_url += compat_urllib_parse.urlencode(f4m_params)
-                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
                  continue
  
-            if src_url.startswith('http'):
+            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                  http_count += 1
                  formats.append({
                      'url': src_url,