Merge branch 'cliphunter' of https://github.com/pornophage/youtube-dl
[youtube-dl] / youtube_dl / extractor / generic.py
index 7d0e117de99eb2ff29992321a63e03dc820ad953..829e5894fafc5f2f8a54c033d211808485cad4af 100644 (file)
@@ -92,11 +92,12 @@ class GenericIE(InfoExtractor):
         # ooyala video
         {
             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
+            'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
             'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
             'info_dict': {
                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
                 'ext': 'mp4',
-                'title': '2cc213299525360.mov', #that's what we get
+                'title': '2cc213299525360.mov',  # that's what we get
             },
         },
     ]
@@ -161,8 +162,19 @@ class GenericIE(InfoExtractor):
     def _real_extract(self, url):
         parsed_url = compat_urlparse.urlparse(url)
         if not parsed_url.scheme:
-            self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
-            return self.url_result('http://' + url)
+            default_search = self._downloader.params.get('default_search')
+            if default_search is None:
+                default_search = 'auto'
+
+            if default_search == 'auto':
+                if '/' in url:
+                    self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
+                    return self.url_result('http://' + url)
+                else:
+                    return self.url_result('ytsearch:' + url)
+            else:
+                assert ':' in default_search
+                return self.url_result(default_search + url)
         video_id = os.path.splitext(url.split('/')[-1])[0]
 
         self.to_screen('%s: Requesting header' % video_id)
@@ -230,7 +242,7 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded (iframe) Vimeo player
         mobj = re.search(
-            r'<iframe[^>]+?src="(https?://player.vimeo.com/video/.+?)"', webpage)
+            r'<iframe[^>]+?src="((?:https?:)?//player.vimeo.com/video/.+?)"', webpage)
         if mobj:
             player_url = unescapeHTML(mobj.group(1))
             surl = smuggle_url(player_url, {'Referer': url})
@@ -303,10 +315,33 @@ class GenericIE(InfoExtractor):
             return OoyalaIE._build_url_result(mobj.group(1))
 
         # Look for Aparat videos
-        mobj = re.search(r'<iframe src="(http://www.aparat.com/video/[^"]+)"', webpage)
+        mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
         if mobj is not None:
             return self.url_result(mobj.group(1), 'Aparat')
 
+        # Look for MPORA videos
+        mobj = re.search(r'<iframe .*?src="(http://mpora\.com/videos/[^"]+)"', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group(1), 'Mpora')
+
+        # Look for embedded Novamov player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Novamov')
+
+        # Look for embedded Facebook player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Facebook')
+
+        # Look for embedded Huffington Post player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live.huffingtonpost\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'HuffPost')
+
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if mobj is None:
@@ -317,7 +352,7 @@ class GenericIE(InfoExtractor):
             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
         if mobj is None:
             # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage)
+            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
         if mobj is None:
             # Try to find twitter cards info
             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)