[generic] Add support for RSS feeds (Fixes #667)

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index e1933837d143ccec1550b2d6b9c7595c23096b67..30160d59d4218601a7bff538f6fd96cc130bd7a4 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
  
  import os
  import re
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from .youtube import YoutubeIE
@@ -38,18 +39,6 @@ class GenericIE(InfoExtractor):
                  'title': 'R\u00e9gis plante sa Jeep',
              }
          },
-        # embedded vimeo video
-        {
-            'add_ie': ['Vimeo'],
-            'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
-            'file': '22444065.mp4',
-            'md5': '2903896e23df39722c33f015af0666e2',
-            'info_dict': {
-                'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
-                'uploader_id': 'skillsmatter',
-                'uploader': 'Skills Matter',
-            }
-        },
          # bandcamp page with custom domain
          {
              'add_ie': ['Bandcamp'],
@@ -78,6 +67,18 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
+        {
+            # https://github.com/rg3/youtube-dl/issues/2253
+            'url': 'http://bcove.me/i6nfkrc3',
+            'file': '3101154703001.mp4',
+            'md5': '0ba9446db037002366bab3b3eb30c88c',
+            'info_dict': {
+                'title': 'Still no power',
+                'uploader': 'thestar.com',
+                'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
+            },
+            'add_ie': ['Brightcove'],
+        },
          # Direct link to a video
          {
              'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
@@ -159,6 +160,25 @@ class GenericIE(InfoExtractor):
              raise ExtractorError('Invalid URL protocol')
          return response
  
+    def _extract_rss(self, url, video_id, doc):
+        """Build a playlist result from doc, the parsed <rss> root Element.
+
+        The feed url is the playlist id (video_id is unused). Items without a
+        <link> are skipped; absent or empty titles/descriptions become None.
+        """
+        entries = []
+        for item in doc.findall('./channel/item'):
+            link = item.findtext('link')  # None, not an error, when absent
+            if link:
+                entries.append({'_type': 'url', 'url': link, 'title': item.findtext('title') or None})
+        return {
+            '_type': 'playlist',
+            'id': url,
+            'title': doc.findtext('./channel/title') or None,
+            'description': doc.findtext('./channel/description') or None,
+            'entries': entries,
+        }
+
      def _real_extract(self, url):
          parsed_url = compat_urlparse.urlparse(url)
          if not parsed_url.scheme:
@@ -219,6 +239,14 @@ class GenericIE(InfoExtractor):
  
          self.report_extraction(video_id)
  
+        # Is it an RSS feed?
+        try:
+            doc = xml.etree.ElementTree.fromstring(webpage)
+            if doc.tag == 'rss':
+                return self._extract_rss(url, video_id, doc)
+        except xml.etree.ElementTree.ParseError:
+            pass
+
          # it's tempting to parse this further, but you would
          # have to take into account all the variations like
          #   Video Title - Site Name
@@ -234,15 +262,25 @@ class GenericIE(InfoExtractor):
              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
  
          # Look for BrightCove:
-        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
-        if bc_url is not None:
+        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
+        if bc_urls:
              self.to_screen('Brightcove video detected.')
-            surl = smuggle_url(bc_url, {'Referer': url})
-            return self.url_result(surl, 'Brightcove')
+            entries = [{
+                '_type': 'url',
+                'url': smuggle_url(bc_url, {'Referer': url}),
+                'ie_key': 'Brightcove'
+            } for bc_url in bc_urls]
+
+            return {
+                '_type': 'playlist',
+                'title': video_title,
+                'id': video_id,
+                'entries': entries,
+            }
  
          # Look for embedded (iframe) Vimeo player
          mobj = re.search(
-            r'<iframe[^>]+?src="((?:https?:)?//player.vimeo.com/video/.+?)"', webpage)
+            r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage)
          if mobj:
              player_url = unescapeHTML(mobj.group(1))
              surl = smuggle_url(player_url, {'Referer': url})
@@ -250,7 +288,7 @@ class GenericIE(InfoExtractor):
  
          # Look for embedded (swf embed) Vimeo player
          mobj = re.search(
-            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage)
+            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
          if mobj:
              return self.url_result(mobj.group(1), 'Vimeo')
  
@@ -320,7 +358,7 @@ class GenericIE(InfoExtractor):
              return self.url_result(mobj.group(1), 'Aparat')
  
          # Look for MPORA videos
-        mobj = re.search(r'<iframe .*?src="(http://mpora\.com/videos/[^"]+)"', webpage)
+        mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
          if mobj is not None:
              return self.url_result(mobj.group(1), 'Mpora')
  
@@ -332,15 +370,21 @@ class GenericIE(InfoExtractor):
  
          # Look for embedded Facebook player
          mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
          if mobj is not None:
              return self.url_result(mobj.group('url'), 'Facebook')
  
+        # Look for embedded Huffington Post player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'HuffPost')
+
          # Start with something easy: JW Player in SWFObject
          mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
          if mobj is None:
              # Look for gorilla-vid style embedding
-            mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage)
+            mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*["\'](.*?)["\']', webpage)
          if mobj is None:
              # Broaden the search a little bit
              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)