Improve URL extraction

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 76852f9dc82b046bf319d075ae7ca67c58b7d205..e5a8ffbe8d14897b410ea2b2078f4073aa5bed42 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -32,6 +32,7 @@ from ..utils import (
      unified_strdate,
      unsmuggle_url,
      UnsupportedError,
+    url_or_none,
      xpath_text,
  )
  from .commonprotocols import RtmpIE
@@ -108,6 +109,10 @@ from .yapfiles import YapFilesIE
  from .vice import ViceIE
  from .xfileshare import XFileShareIE
  from .cloudflarestream import CloudflareStreamIE
+from .peertube import PeerTubeIE
+from .indavideo import IndavideoEmbedIE
+from .apa import APAIE
+from .foxnews import FoxNewsIE
  
  
  class GenericIE(InfoExtractor):
@@ -1391,17 +1396,6 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
-        # SVT embed
-        {
-            'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
-            'info_dict': {
-                'id': '2900353',
-                'ext': 'flv',
-                'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
-                'duration': 27,
-                'age_limit': 0,
-            },
-        },
          # Crooks and Liars embed
          {
              'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
@@ -2012,6 +2006,50 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
+        {
+            # PeerTube embed
+            'url': 'https://joinpeertube.org/fr/home/',
+            'info_dict': {
+                'id': 'home',
+                'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube',
+            },
+            'playlist_count': 2,
+        },
+        {
+            # Indavideo embed
+            'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/',
+            'info_dict': {
+                'id': '1693903',
+                'ext': 'mp4',
+                'title': 'Így kell otthon hamburgert sütni',
+                'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7',
+                'timestamp': 1426330212,
+                'upload_date': '20150314',
+                'uploader': 'StreetKitchen',
+                'uploader_id': '546363',
+            },
+            'add_ie': [IndavideoEmbedIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # APA embed via JWPlatform embed
+            'url': 'http://www.vol.at/blue-man-group/5593454',
+            'info_dict': {
+                'id': 'jjv85FdZ',
+                'ext': 'mp4',
+                'title': '"Blau ist mysteriös": Die Blue Man Group im Interview',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 254,
+                'timestamp': 1519211149,
+                'upload_date': '20180221',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
          {
              'url': 'http://share-videos.se/auto/video/83645793?uid=13',
              'md5': 'b68d276de422ab07ee1d49388103f457',
@@ -3029,6 +3067,26 @@ class GenericIE(InfoExtractor):
              return self.playlist_from_matches(
                  cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key())
  
+        peertube_urls = PeerTubeIE._extract_urls(webpage, url)
+        if peertube_urls:
+            return self.playlist_from_matches(
+                peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
+
+        indavideo_urls = IndavideoEmbedIE._extract_urls(webpage)
+        if indavideo_urls:
+            return self.playlist_from_matches(
+                indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key())
+
+        apa_urls = APAIE._extract_urls(webpage)
+        if apa_urls:
+            return self.playlist_from_matches(
+                apa_urls, video_id, video_title, ie=APAIE.ie_key())
+
+        foxnews_urls = FoxNewsIE._extract_urls(webpage)
+        if foxnews_urls:
+            return self.playlist_from_matches(
+                foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key())
+
          sharevideos_urls = [mobj.group('url') for mobj in re.finditer(
              r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1',
              webpage)]
@@ -3073,8 +3131,8 @@ class GenericIE(InfoExtractor):
                  sources = [sources]
              formats = []
              for source in sources:
-                src = source.get('src')
-                if not src or not isinstance(src, compat_str):
+                src = url_or_none(source.get('src'))
+                if not src:
                      continue
                  src = compat_urlparse.urljoin(url, src)
                  src_type = source.get('type')