Merge branch 'vgtv' of https://github.com/mrkolby/youtube-dl into mrkolby-vgtv
[youtube-dl] / youtube_dl / extractor / generic.py
index 44f7ea3fd89b3753052cb152ab3c6750ad1b4b18..1b7697870bde93fd5bb4218d12eac8c166a306ba 100644 (file)
@@ -12,6 +12,7 @@ from ..utils import (
     compat_urlparse,
     compat_xml_parse_error,
 
+    determine_ext,
     ExtractorError,
     float_or_none,
     HEADRequest,
@@ -351,7 +352,36 @@ class GenericIE(InfoExtractor):
                 'description': 're:'
             },
             'playlist_mincount': 11,
-        }
+        },
+        # Multiple brightcove videos
+        # https://github.com/rg3/youtube-dl/issues/2283
+        {
+            'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
+            'info_dict': {
+                'id': 'always-never',
+                'title': 'Always / Never - The New Yorker',
+            },
+            'playlist_count': 3,
+            'params': {
+                'extract_flat': False,
+                'skip_download': True,
+            }
+        },
+        # MLB embed
+        {
+            'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
+            'md5': '96f09a37e44da40dd083e12d9a683327',
+            'info_dict': {
+                'id': '33322633',
+                'ext': 'mp4',
+                'title': 'Ump changes call to ball',
+                'description': 'md5:71c11215384298a172a6dcb4c2e20685',
+                'duration': 48,
+                'timestamp': 1401537900,
+                'upload_date': '20140531',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
     ]
 
     def report_download_webpage(self, video_id):
@@ -794,6 +824,12 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'SBS')
 
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'MLB')
+
         # Start with something easy: JW Player in SWFObject
         found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if not found:
@@ -819,7 +855,6 @@ class GenericIE(InfoExtractor):
                     \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
                         ["']?url["']?\s*:\s*["']([^"']+)["']
             ''', webpage)
-            assert found
         if not found:
             # Try to find twitter cards info
             found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -831,13 +866,14 @@ class GenericIE(InfoExtractor):
             if m_video_type is not None:
                 def check_video(vurl):
                     vpath = compat_urlparse.urlparse(vurl).path
-                    return '.' in vpath and not vpath.endswith('.swf')
+                    vext = determine_ext(vpath)
+                    return '.' in vpath and vext not in ('swf', 'png', 'jpg')
                 found = list(filter(
                     check_video,
                     re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
         if not found:
             # HTML5 video
-            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
         if not found:
             found = re.search(
                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'