[redtube] Improve formats extraction and extract m3u8 formats (closes #25311, closes...

[youtube-dl] / youtube_dl / extractor / redtube.py
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py

index 5c84028ef97e8220d494633f5ada42804fa2ae7f..2d2f6a98c97dba8605cb9f640c7c73d860caa1d0 100644 (file)
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -4,6 +4,7 @@ import re
  
  from .common import InfoExtractor
  from ..utils import (
+    determine_ext,
      ExtractorError,
      int_or_none,
      merge_dicts,
@@ -43,14 +44,21 @@ class RedTubeIE(InfoExtractor):
          webpage = self._download_webpage(
              'http://www.redtube.com/%s' % video_id, video_id)
  
-        if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
-            raise ExtractorError('Video %s has been removed' % video_id, expected=True)
+        ERRORS = (
+            (('video-deleted-info', '>This video has been removed'), 'has been removed'),
+            (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'),
+        )
+
+        for patterns, message in ERRORS:
+            if any(p in webpage for p in patterns):
+                raise ExtractorError(
+                    'Video %s %s' % (video_id, message), expected=True)
  
          info = self._search_json_ld(webpage, video_id, default={})
  
          if not info.get('title'):
              info['title'] = self._html_search_regex(
-                (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
+                (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
                   r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',),
                  webpage, 'title', group='title',
                  default=None) or self._og_search_title(webpage)
@@ -70,7 +78,7 @@ class RedTubeIE(InfoExtractor):
                      })
          medias = self._parse_json(
              self._search_regex(
-                r'mediaDefinition\s*:\s*(\[.+?\])', webpage,
+                r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage,
                  'media definitions', default='{}'),
              video_id, fatal=False)
          if medias and isinstance(medias, list):
@@ -78,6 +86,12 @@ class RedTubeIE(InfoExtractor):
                  format_url = url_or_none(media.get('videoUrl'))
                  if not format_url:
                      continue
+                if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls',
+                        fatal=False))
+                    continue
                  format_id = media.get('quality')
                  formats.append({
                      'url': format_url,