[sunporno] Extract all formats and metadata

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 8b11f7f7a086cc28828a8e0afe7ec520d13ed956..b4f748e87fbc08ab62d62dc1b17b5da817afa5b3 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -8,12 +8,11 @@ import re
  from .common import InfoExtractor
  from .youtube import YoutubeIE
  from ..utils import (
-    compat_urllib_error,
      compat_urllib_parse,
-    compat_urllib_request,
      compat_urlparse,
      compat_xml_parse_error,
  
+    determine_ext,
      ExtractorError,
      float_or_none,
      HEADRequest,
@@ -343,6 +342,30 @@ class GenericIE(InfoExtractor):
                  'uploader': 'www.handjobhub.com',
                  'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub',
              }
+        },
+        # RSS feed
+        {
+            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+            'info_dict': {
+                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+                'title': 'Zero Punctuation',
+                'description': 're:'
+            },
+            'playlist_mincount': 11,
+        },
+        # Multiple brightcove videos
+        # https://github.com/rg3/youtube-dl/issues/2283
+        {
+            'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
+            'info_dict': {
+                'id': 'always-never',
+                'title': 'Always / Never - The New Yorker',
+            },
+            'playlist_count': 3,
+            'params': {
+                'extract_flat': False,
+                'skip_download': True,
+            }
          }
      ]
  
@@ -811,7 +834,6 @@ class GenericIE(InfoExtractor):
                      \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
                          ["']?url["']?\s*:\s*["']([^"']+)["']
              ''', webpage)
-            assert found
          if not found:
              # Try to find twitter cards info
              found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -823,13 +845,14 @@ class GenericIE(InfoExtractor):
              if m_video_type is not None:
                  def check_video(vurl):
                      vpath = compat_urlparse.urlparse(vurl).path
-                    return '.' in vpath and not vpath.endswith('.swf')
+                    vext = determine_ext(vpath)
+                    return '.' in vpath and vext not in ('swf', 'png', 'jpg')
                  found = list(filter(
                      check_video,
                      re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
          if not found:
              # HTML5 video
-            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
          if not found:
              found = re.search(
                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'