[extractor/generic] Force Accept-Encoding to any for extraction pass

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 046bcb0f040531d55f86a2dbe72e541f8a24d039..ec1d9abbe4c9972315206e7f92f569afeb88a182 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -11,6 +11,7 @@ from ..compat import (
      compat_urllib_parse,
      compat_urlparse,
      compat_xml_parse_error,
+    compat_urllib_request,
  )
  from ..utils import (
      determine_ext,
@@ -32,6 +33,7 @@ from .brightcove import BrightcoveIE
  from .nbc import NBCSportsVPlayerIE
  from .ooyala import OoyalaIE
  from .rutv import RUTVIE
+from .sportbox import SportBoxEmbedIE
  from .smotri import SmotriIE
  from .condenast import CondeNastIE
  from .udn import UDNEmbedIE
@@ -224,6 +226,37 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
+        # SportBox embed
+        {
+            'url': 'http://www.vestifinance.ru/articles/25753',
+            'info_dict': {
+                'id': '25753',
+                'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'id': '370908',
+                    'title': 'Госзаказ. День 3',
+                    'ext': 'mp4',
+                }
+            }, {
+                'info_dict': {
+                    'id': '370905',
+                    'title': 'Госзаказ. День 2',
+                    'ext': 'mp4',
+                }
+            }, {
+                'info_dict': {
+                    'id': '370902',
+                    'title': 'Госзаказ. День 1',
+                    'ext': 'mp4',
+                }
+            }],
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
          # Embedded TED video
          {
              'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -414,19 +447,6 @@ class GenericIE(InfoExtractor):
                  'thumbnail': 're:^https?://.*\.jpg$',
              },
          },
-        # MLB articles
-        {
-            'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
-            'md5': 'b190e70141fb9a1552a85426b4da1b5d',
-            'info_dict': {
-                'id': '75609783',
-                'ext': 'mp4',
-                'title': 'Must C: Pillar climbs for catch',
-                'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
-                'timestamp': 1429124820,
-                'upload_date': '20150415',
-            }
-        },
          # Wistia embed
          {
              'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
@@ -659,6 +679,17 @@ class GenericIE(InfoExtractor):
                  'title': 'Facebook Creates "On This Day" | Crunch Report',
              },
          },
+        # SVT embed
+        {
+            'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
+            'info_dict': {
+                'id': '2900353',
+                'ext': 'flv',
+                'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+                'duration': 27,
+                'age_limit': 0,
+            },
+        },
          # RSS feed with enclosure
          {
              'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
@@ -864,7 +895,7 @@ class GenericIE(InfoExtractor):
              force_videoid = smuggled_data['force_videoid']
              video_id = force_videoid
          else:
-            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+            video_id = os.path.splitext(compat_urllib_parse.unquote(url.rstrip('/').split('/')[-1]))[0]
  
          self.to_screen('%s: Requesting header' % video_id)
  
@@ -886,7 +917,9 @@ class GenericIE(InfoExtractor):
  
          full_response = None
          if head_response is False:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
              head_response = full_response
  
          # Check for direct link to a video
@@ -897,7 +930,7 @@ class GenericIE(InfoExtractor):
                  head_response.headers.get('Last-Modified'))
              return {
                  'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': os.path.splitext(compat_urllib_parse.unquote(url_basename(url)))[0],
                  'direct': True,
                  'formats': [{
                      'format_id': m.group('format_id'),
@@ -911,7 +944,17 @@ class GenericIE(InfoExtractor):
              self._downloader.report_warning('Falling back on generic information extractor.')
  
          if not full_response:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
+            # making it impossible to download only a chunk of the file (yet we need only 512kB to
+            # test whether it's HTML or not). With youtube-dl's default Accept-Encoding this
+            # will always result in downloading the whole file, which is not desirable.
+            # Therefore for the extraction pass we have to override Accept-Encoding to any in order
+            # to accept raw bytes and be able to download only a chunk.
+            # It would probably be better to solve this by checking Content-Type for application/octet-stream
+            # after the HEAD request finishes, but it is not clear whether we can rely on this.
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
  
          # Maybe it's a direct link to a video?
          # Be careful not to download the whole thing!
@@ -923,7 +966,7 @@ class GenericIE(InfoExtractor):
                  head_response.headers.get('Last-Modified'))
              return {
                  'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': os.path.splitext(compat_urllib_parse.unquote(url_basename(url)))[0],
                  'direct': True,
                  'url': url,
                  'upload_date': upload_date,
@@ -1231,6 +1274,11 @@ class GenericIE(InfoExtractor):
          if rutv_url:
              return self.url_result(rutv_url, 'RUTV')
  
+        # Look for embedded SportBox player
+        sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
+        if sportbox_urls:
+            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+
          # Look for embedded TED player
          mobj = re.search(
              r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
@@ -1390,7 +1438,7 @@ class GenericIE(InfoExtractor):
          # Look for Senate ISVP iframe
          senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
          if senate_isvp_url:
-            return self.url_result(surl, 'SenateISVP')
+            return self.url_result(senate_isvp_url, 'SenateISVP')
  
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):