[generic] Allow multiple matches for generic hits (Fixes #2818)

author Philipp Hagemeister <phihag@phihag.de>

Wed, 30 Apr 2014 00:23:51 +0000 (02:23 +0200)

committer Philipp Hagemeister <phihag@phihag.de>

Wed, 30 Apr 2014 00:23:51 +0000 (02:23 +0200)
author Philipp Hagemeister <phihag@phihag.de>
Wed, 30 Apr 2014 00:23:51 +0000 (02:23 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Wed, 30 Apr 2014 00:23:51 +0000 (02:23 +0200)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index cfb009d7954c577526d4f086fbfbe48c2034388a..58092da38e44a642efb165a0dc62ee149412a847 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -637,70 +637,77 @@ class GenericIE(InfoExtractor):
              return self.url_result(smotri_url, 'Smotri')
  
          # Start with something easy: JW Player in SWFObject
              return self.url_result(smotri_url, 'Smotri')
  
          # Start with something easy: JW Player in SWFObject
-        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
-        if mobj is None:
+        found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
+        if not found:
              # Look for gorilla-vid style embedding
              # Look for gorilla-vid style embedding
-            mobj = re.search(r'''(?sx)
+            found = re.findall(r'''(?sx)
                  (?:
                      jw_plugins|
                      JWPlayerOptions|
                      jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
                  )
                  .*?file\s*:\s*["\'](.*?)["\']''', webpage)
                  (?:
                      jw_plugins|
                      JWPlayerOptions|
                      jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
                  )
                  .*?file\s*:\s*["\'](.*?)["\']''', webpage)
-        if mobj is None:
+        if not found:
              # Broaden the search a little bit
              # Broaden the search a little bit
-            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
-
-        if mobj is None:
+            found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+        if not found:
+            # Broaden the search a little bit: JWPlayer JS loader
+            found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+        if not found:
              # Try to find twitter cards info
              # Try to find twitter cards info
-            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
-        if mobj is None:
+            found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
+        if not found:
              # We look for Open Graph info:
              # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
              # We look for Open Graph info:
              # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
-            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+            m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
              # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
              if m_video_type is not None:
              # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
              if m_video_type is not None:
-                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
-        if mobj is None:
+                found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
+        if not found:
              # HTML5 video
              # HTML5 video
-            mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
-        if mobj is None:
-            mobj = re.search(
+            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+        if not found:
+            found = re.findall(
                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
                  r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
                  webpage)
                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
                  r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
                  webpage)
-            if mobj:
-                new_url = mobj.group(1)
+            if found:
+                new_url = found.group(1)
                  self.report_following_redirect(new_url)
                  return {
                      '_type': 'url',
                      'url': new_url,
                  }
                  self.report_following_redirect(new_url)
                  return {
                      '_type': 'url',
                      'url': new_url,
                  }
-        if mobj is None:
+        if not found:
              raise ExtractorError('Unsupported URL: %s' % url)
  
              raise ExtractorError('Unsupported URL: %s' % url)
  
-        # It's possible that one of the regexes
-        # matched, but returned an empty group:
-        if mobj.group(1) is None:
-            raise ExtractorError('Did not find a valid video URL at %s' % url)
+        entries = []
+        for video_url in found:
+            video_url = compat_urlparse.urljoin(url, video_url)
+            video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
  
  
-        video_url = mobj.group(1)
-        video_url = compat_urlparse.urljoin(url, video_url)
-        video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+            # Sometimes, jwplayer extraction will result in a YouTube URL
+            if YoutubeIE.suitable(video_url):
+                entries.append(self.url_result(video_url, 'Youtube'))
+                continue
  
  
-        # Sometimes, jwplayer extraction will result in a YouTube URL
-        if YoutubeIE.suitable(video_url):
-            return self.url_result(video_url, 'Youtube')
+            # here's a fun little line of code for you:
+            video_id = os.path.splitext(video_id)[0]
  
  
-        # here's a fun little line of code for you:
-        video_id = os.path.splitext(video_id)[0]
+            entries.append({
+                'id': video_id,
+                'url': video_url,
+                'uploader': video_uploader,
+                'title': video_title,
+            })
+
+        if len(entries) == 1:
+            return entries[1]
+        else:
+            for num, e in enumerate(entries, start=1):
+                e['title'] = '%s (%d)' % (e['title'], num)
+            return {
+                '_type': 'playlist',
+                'entries': entries,
+            }
  
  
-        return {
-            'id': video_id,
-            'url': video_url,
-            'uploader': video_uploader,
-            'title': video_title,
-        }
author	Philipp Hagemeister <phihag@phihag.de>
	Wed, 30 Apr 2014 00:23:51 +0000 (02:23 +0200)
committer	Philipp Hagemeister <phihag@phihag.de>
	Wed, 30 Apr 2014 00:23:51 +0000 (02:23 +0200)