[crunchyroll] Fix title extraction (Closes #7396)

[youtube-dl] / youtube_dl / extractor / crunchyroll.py
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py

index cecd0c7843b9b42881637ad0617e5e351eb8f97e..4243f3e2e24f8c4080b914a45dff4dede3340280 100644 (file)
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -5,12 +5,12 @@ import re
  import json
  import base64
  import zlib
-import xml.etree.ElementTree
  
  from hashlib import sha1
  from math import pow, sqrt, floor
  from .common import InfoExtractor
  from ..compat import (
+    compat_etree_fromstring,
      compat_urllib_parse,
      compat_urllib_parse_unquote,
      compat_urllib_request,
@@ -234,7 +234,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
          return output
  
      def _extract_subtitles(self, subtitle):
-        sub_root = xml.etree.ElementTree.fromstring(subtitle)
+        sub_root = compat_etree_fromstring(subtitle)
          return [{
              'ext': 'srt',
              'data': self._convert_subtitles_to_srt(sub_root),
@@ -245,7 +245,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
  
      def _get_subtitles(self, video_id, webpage):
          subtitles = {}
-        for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
+        for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage):
              sub_page = self._download_webpage(
                  'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
                  video_id, note='Downloading subtitles for ' + sub_name)
@@ -287,7 +287,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
          if 'To view this, please log in to verify you are 18 or older.' in webpage:
              self.raise_login_required()
  
-        video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
+        video_title = self._html_search_regex(
+            r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
+            webpage, 'video_title')
          video_title = re.sub(r' {2,}', ' ', video_title)
          video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
          if not video_description: