[generic] Add a test case for direct links with broken HEAD (#4032)

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 158dc95a5a5380ff56414c2832308f3441cd2a66..52f286ac6efd35f33524bef31084ea181e55002d 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -380,6 +380,32 @@ class GenericIE(InfoExtractor):
                  'uploader': 'education-portal.com',
              },
          },
+        {
+            'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
+            'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
+            'info_dict': {
+                'id': 'uxjb0lwrcz',
+                'ext': 'mp4',
+                'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+                'duration': 1715.0,
+                'uploader': 'thoughtworks.wistia.com',
+            },
+        },
+        # Direct download with broken HEAD
+        {
+            'url': 'http://ai-radio.org:8000/radio.opus',
+            'info_dict': {
+                'id': 'radio',
+                'ext': 'opus',
+                'title': 'radio',
+            },
+            'params': {
+                'skip_download': True,  # infinite live stream
+            },
+            'expected_warnings': [
+                r'501.*Not Implemented'
+            ],
+        }
      ]
  
      def report_following_redirect(self, new_url):
@@ -476,7 +502,8 @@ class GenericIE(InfoExtractor):
                       'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
                      ) % (url, url), expected=True)
              else:
-                assert ':' in default_search
+                if ':' not in default_search:
+                    default_search += ':'
                  return self.url_result(default_search + url)
  
          url, smuggled_data = unsmuggle_url(url)
@@ -491,14 +518,14 @@ class GenericIE(InfoExtractor):
          self.to_screen('%s: Requesting header' % video_id)
  
          head_req = HEADRequest(url)
-        response = self._request_webpage(
+        head_response = self._request_webpage(
              head_req, video_id,
              note=False, errnote='Could not send HEAD request to %s' % url,
              fatal=False)
  
-        if response is not False:
+        if head_response is not False:
              # Check for redirect
-            new_url = response.geturl()
+            new_url = head_response.geturl()
              if url != new_url:
                  self.report_following_redirect(new_url)
                  if force_videoid:
@@ -506,34 +533,35 @@ class GenericIE(InfoExtractor):
                          new_url, {'force_videoid': force_videoid})
                  return self.url_result(new_url)
  
-            # Check for direct link to a video
-            content_type = response.headers.get('Content-Type', '')
-            m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
-            if m:
-                upload_date = response.headers.get('Last-Modified')
-                if upload_date:
-                    upload_date = unified_strdate(upload_date)
-                return {
-                    'id': video_id,
-                    'title': os.path.splitext(url_basename(url))[0],
-                    'formats': [{
-                        'format_id': m.group('format_id'),
-                        'url': url,
-                        'vcodec': 'none' if m.group('type') == 'audio' else None
-                    }],
-                    'upload_date': upload_date,
-                }
+        full_response = None
+        if head_response is False:
+            full_response = self._request_webpage(url, video_id)
+            head_response = full_response
+
+        # Check for direct link to a video
+        content_type = head_response.headers.get('Content-Type', '')
+        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+        if m:
+            upload_date = unified_strdate(
+                head_response.headers.get('Last-Modified'))
+            return {
+                'id': video_id,
+                'title': os.path.splitext(url_basename(url))[0],
+                'formats': [{
+                    'format_id': m.group('format_id'),
+                    'url': url,
+                    'vcodec': 'none' if m.group('type') == 'audio' else None
+                }],
+                'upload_date': upload_date,
+            }
  
          if not self._downloader.params.get('test', False) and not is_intentional:
              self._downloader.report_warning('Falling back on generic information extractor.')
  
-        try:
+        if full_response:
+            webpage = self._webpage_read_content(full_response, url, video_id)
+        else:
              webpage = self._download_webpage(url, video_id)
-        except ValueError:
-            # since this is the last-resort InfoExtractor, if
-            # this error is thrown, it'll be thrown here
-            raise ExtractorError('Failed to download URL: %s' % url)
-
          self.report_extraction(video_id)
  
          # Is it an RSS feed?
@@ -623,7 +651,8 @@ class GenericIE(InfoExtractor):
                  <iframe[^>]+?src=|
                  data-video-url=|
                  <embed[^>]+?src=|
-                embedSWF\(?:\s*
+                embedSWF\(?:\s*|
+                new\s+SWFObject\(
              )
              (["\'])
                  (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
@@ -652,17 +681,20 @@ class GenericIE(InfoExtractor):
  
          # Look for embedded Wistia player
          match = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
          if match:
+            embed_url = self._proto_relative_url(
+                unescapeHTML(match.group('url')))
              return {
                  '_type': 'url_transparent',
-                'url': unescapeHTML(match.group('url')),
+                'url': embed_url,
                  'ie_key': 'Wistia',
                  'uploader': video_uploader,
                  'title': video_title,
                  'id': video_id,
              }
-        match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
+
+        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
          if match:
              return {
                  '_type': 'url_transparent',