[extractor/generic] Add test for large compressed media

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 9a7b0d25d790054e39729bab63e42b1ea7a89dff..737141f954b4acf9fcacc9e4f946983f757888ca 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -9,6 +9,8 @@ from .common import InfoExtractor
  from .youtube import YoutubeIE
  from ..compat import (
      compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_request,
      compat_urlparse,
      compat_xml_parse_error,
  )
@@ -136,6 +138,20 @@ class GenericIE(InfoExtractor):
                  'upload_date': '20100513',
              }
          },
+        # Direct link to media that is delivered compressed (requires Accept-Encoding == *)
+        {
+            'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+            'md5': '128c42e68b13950268b648275386fc74',
+            'info_dict': {
+                'id': 'FictionJunction-Parallel_Hearts',
+                'ext': 'flac',
+                'title': 'FictionJunction-Parallel_Hearts',
+                'upload_date': '20140522',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
          # ooyala video
          {
              'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -894,7 +910,7 @@ class GenericIE(InfoExtractor):
              force_videoid = smuggled_data['force_videoid']
              video_id = force_videoid
          else:
-            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+            video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
  
          self.to_screen('%s: Requesting header' % video_id)
  
@@ -916,7 +932,9 @@ class GenericIE(InfoExtractor):
  
          full_response = None
          if head_response is False:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
              head_response = full_response
  
          # Check for direct link to a video
@@ -927,7 +945,7 @@ class GenericIE(InfoExtractor):
                  head_response.headers.get('Last-Modified'))
              return {
                  'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                  'direct': True,
                  'formats': [{
                      'format_id': m.group('format_id'),
@@ -941,7 +959,17 @@ class GenericIE(InfoExtractor):
              self._downloader.report_warning('Falling back on generic information extractor.')
  
          if not full_response:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
+            # making it impossible to download only a chunk of the file (yet we need only 512kB to
+            # test whether it's HTML or not). With youtube-dl's default Accept-Encoding, that will
+            # always result in downloading the whole file, which is not desirable.
+            # Therefore, for the extraction pass we have to override Accept-Encoding to any in order
+            # to accept raw bytes and be able to download only a chunk.
+            # It would probably be better to solve this by checking Content-Type for application/octet-stream
+            # after the HEAD request finishes, but we are not sure whether we can rely on this.
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
  
          # Maybe it's a direct link to a video?
          # Be careful not to download the whole thing!
@@ -953,7 +981,7 @@ class GenericIE(InfoExtractor):
                  head_response.headers.get('Last-Modified'))
              return {
                  'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                  'direct': True,
                  'url': url,
                  'upload_date': upload_date,