[youtube] Always request webpage in English (Fixes #3844)

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 8b11f7f7a086cc28828a8e0afe7ec520d13ed956..0dfa4853dbd3b1cbacaf2b9532dcdeffc8a9300c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -8,12 +8,11 @@ import re
  from .common import InfoExtractor
  from .youtube import YoutubeIE
  from ..utils import (
-    compat_urllib_error,
      compat_urllib_parse,
-    compat_urllib_request,
      compat_urlparse,
      compat_xml_parse_error,
  
+    determine_ext,
      ExtractorError,
      float_or_none,
      HEADRequest,
@@ -343,15 +342,61 @@ class GenericIE(InfoExtractor):
                  'uploader': 'www.handjobhub.com',
                  'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub',
              }
-        }
+        },
+        # RSS feed
+        {
+            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+            'info_dict': {
+                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+                'title': 'Zero Punctuation',
+                'description': 're:'
+            },
+            'playlist_mincount': 11,
+        },
+        # Multiple brightcove videos
+        # https://github.com/rg3/youtube-dl/issues/2283
+        {
+            'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
+            'info_dict': {
+                'id': 'always-never',
+                'title': 'Always / Never - The New Yorker',
+            },
+            'playlist_count': 3,
+            'params': {
+                'extract_flat': False,
+                'skip_download': True,
+            }
+        },
+        # MLB embed
+        {
+            'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
+            'md5': '96f09a37e44da40dd083e12d9a683327',
+            'info_dict': {
+                'id': '33322633',
+                'ext': 'mp4',
+                'title': 'Ump changes call to ball',
+                'description': 'md5:71c11215384298a172a6dcb4c2e20685',
+                'duration': 48,
+                'timestamp': 1401537900,
+                'upload_date': '20140531',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
+        # Wistia embed
+        {
+            'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
+            'md5': '8788b683c777a5cf25621eaf286d0c23',
+            'info_dict': {
+                'id': '1cfaf6b7ea',
+                'ext': 'mov',
+                'title': 'md5:51364a8d3d009997ba99656004b5e20d',
+                'duration': 643.0,
+                'filesize': 182808282,
+                'uploader': 'education-portal.com',
+            },
+        },
      ]
  
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        if not self._downloader.params.get('test', False):
-            self._downloader.report_warning('Falling back on generic information extractor.')
-        super(GenericIE, self).report_download_webpage(video_id)
-
      def report_following_redirect(self, new_url):
          """Report information extraction."""
          self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
@@ -451,6 +496,7 @@ class GenericIE(InfoExtractor):
  
          url, smuggled_data = unsmuggle_url(url)
          force_videoid = None
+        is_intentional = smuggled_data and smuggled_data.get('to_generic')
          if smuggled_data and 'force_videoid' in smuggled_data:
              force_videoid = smuggled_data['force_videoid']
              video_id = force_videoid
@@ -493,6 +539,9 @@ class GenericIE(InfoExtractor):
                      'upload_date': upload_date,
                  }
  
+        if not self._downloader.params.get('test', False) and not is_intentional:
+            self._downloader.report_warning('Falling back on generic information extractor.')
+
          try:
              webpage = self._download_webpage(url, video_id)
          except ValueError:
@@ -546,7 +595,9 @@ class GenericIE(InfoExtractor):
  
          # Helper method
          def _playlist_from_matches(matches, getter, ie=None):
-            urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches)
+            urlrs = orderedSet(
+                self.url_result(self._proto_relative_url(getter(m)), ie)
+                for m in matches)
              return self.playlist_result(
                  urlrs, playlist_id=video_id, playlist_title=video_title)
  
@@ -590,12 +641,12 @@ class GenericIE(InfoExtractor):
                  embedSWF\(?:\s*
              )
              (["\'])
-                (?P<url>(?:https?:)?//(?:www\.)?youtube\.com/
-                (?:embed|v)/.+?)
+                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
+                (?:embed|v|p)/.+?)
              \1''', webpage)
          if matches:
              return _playlist_from_matches(
-                matches, lambda m: unescapeHTML(m[1]), ie='Youtube')
+                matches, lambda m: unescapeHTML(m[1]))
  
          # Look for embedded Dailymotion player
          matches = re.findall(
@@ -616,6 +667,16 @@ class GenericIE(InfoExtractor):
                  'title': video_title,
                  'id': video_id,
              }
+        match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
+        if match:
+            return {
+                '_type': 'url_transparent',
+                'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
+                'ie_key': 'Wistia',
+                'uploader': video_uploader,
+                'title': video_title,
+                'id': match.group('id')
+            }
  
          # Look for embedded blip.tv player
          mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
@@ -786,6 +847,12 @@ class GenericIE(InfoExtractor):
          if mobj is not None:
              return self.url_result(mobj.group('url'), 'SBS')
  
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'MLB')
+
          # Start with something easy: JW Player in SWFObject
          found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
          if not found:
@@ -811,7 +878,6 @@ class GenericIE(InfoExtractor):
                      \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
                          ["']?url["']?\s*:\s*["']([^"']+)["']
              ''', webpage)
-            assert found
          if not found:
              # Try to find twitter cards info
              found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -823,17 +889,18 @@ class GenericIE(InfoExtractor):
              if m_video_type is not None:
                  def check_video(vurl):
                      vpath = compat_urlparse.urlparse(vurl).path
-                    return '.' in vpath and not vpath.endswith('.swf')
+                    vext = determine_ext(vpath)
+                    return '.' in vpath and vext not in ('swf', 'png', 'jpg')
                  found = list(filter(
                      check_video,
                      re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
          if not found:
              # HTML5 video
-            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
          if not found:
              found = re.search(
                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
-                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
                  webpage)
              if found:
                  new_url = found.group(1)