[generic] Add support for RSS feeds (Fixes #667)
[youtube-dl] / youtube_dl / extractor / generic.py
index 082da9c77b138f35651e2de8a2d035f8fee1f517..30160d59d4218601a7bff538f6fd96cc130bd7a4 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 
 import os
 import re
+import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from .youtube import YoutubeIE
@@ -159,6 +160,25 @@ class GenericIE(InfoExtractor):
             raise ExtractorError('Invalid URL protocol')
         return response
 
+    def _extract_rss(self, url, video_id, doc):
+        playlist_title = doc.find('./channel/title').text
+        playlist_desc_el = doc.find('./channel/description')
+        playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
+
+        entries = [{
+            '_type': 'url',
+            'url': e.find('link').text,
+            'title': e.find('title').text,
+        } for e in doc.findall('./channel/item')]
+
+        return {
+            '_type': 'playlist',
+            'id': url,
+            'title': playlist_title,
+            'description': playlist_desc,
+            'entries': entries,
+        }
+
     def _real_extract(self, url):
         parsed_url = compat_urlparse.urlparse(url)
         if not parsed_url.scheme:
@@ -219,6 +239,14 @@ class GenericIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
+        # Is it an RSS feed?
+        try:
+            doc = xml.etree.ElementTree.fromstring(webpage)
+            if doc.tag == 'rss':
+                return self._extract_rss(url, video_id, doc)
+        except xml.etree.ElementTree.ParseError:
+            pass
+
         # it's tempting to parse this further, but you would
         # have to take into account all the variations like
         #   Video Title - Site Name
@@ -234,11 +262,21 @@ class GenericIE(InfoExtractor):
             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
 
         # Look for BrightCove:
-        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
-        if bc_url is not None:
+        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
+        if bc_urls:
             self.to_screen('Brightcove video detected.')
-            surl = smuggle_url(bc_url, {'Referer': url})
-            return self.url_result(surl, 'Brightcove')
+            entries = [{
+                '_type': 'url',
+                'url': smuggle_url(bc_url, {'Referer': url}),
+                'ie_key': 'Brightcove'
+            } for bc_url in bc_urls]
+
+            return {
+                '_type': 'playlist',
+                'title': video_title,
+                'id': video_id,
+                'entries': entries,
+            }
 
         # Look for embedded (iframe) Vimeo player
         mobj = re.search(