Add Ustream channel support
author tewe <tewe@github>
Thu, 12 Sep 2013 10:30:14 +0000 (12:30 +0200)
committer tewe <tewe@github>
Thu, 12 Sep 2013 10:30:14 +0000 (12:30 +0200)
youtube_dl/extractor/__init__.py
youtube_dl/extractor/ustream.py

index 26cf249354722461846cbb8c117cbd3d8bffcbfc..a7cddef733bc611ccf7f6ea5ae6db0ae9613903b 100644 (file)
@@ -96,7 +96,7 @@ from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tutv import TutvIE
 from .unistra import UnistraIE
 from .tumblr import TumblrIE
 from .tutv import TutvIE
 from .unistra import UnistraIE
-from .ustream import UstreamIE
+from .ustream import UstreamIE, UstreamChannelIE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
 from .veoh import VeohIE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
 from .veoh import VeohIE
index 5f423870abb6c9e337f7e0c328b16ed3ef5049ab..16cdcc76592feb03ad9b30b962e66041eb391168 100644 (file)
@@ -1,4 +1,7 @@
+from HTMLParser import HTMLParser
+import json
 import re
 import re
+from urlparse import urljoin
 
 from .common import InfoExtractor
 
 
 from .common import InfoExtractor
 
@@ -43,3 +46,70 @@ class UstreamIE(InfoExtractor):
                 'thumbnail': thumbnail,
                }
         return info
                 'thumbnail': thumbnail,
                }
         return info
+
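+# The HTMLParser subclasses below read tag attributes directly, which is
+# more robust than matching the markup with regular expressions.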
+
+class ChannelParser(HTMLParser):
+    """Find the numeric channel id advertised by a Ustream channel page.
+
+    The id is read from a tag of this form:
+
+    <meta name="ustream:channel_id" content="1234">
+
+    After the page has been fed, ``channel_id`` holds the digit string of
+    the last matching tag, or None if no matching tag was seen.
+    """
+    channel_id = None
+
+    def handle_starttag(self, tag, attrs):
+        """Store the content of a ``ustream:channel_id`` meta tag.
+
+        ``attrs`` is the list of (name, value) pairs supplied by HTMLParser.
+        Tags other than ``meta``, other meta names and non-numeric contents
+        are ignored.
+        """
+        if tag != 'meta':
+            return
+        values = dict(attrs)
+        if values.get('name') != 'ustream:channel_id':
+            return
+        # HTMLParser reports a valueless attribute (<meta ... content>) as
+        # None, so the default of dict.get() never applies; coerce to ''
+        # to avoid calling .isdigit() on None.
+        value = values.get('content') or ''
+        if value.isdigit():
+            self.channel_id = value
+
+class SocialstreamParser(HTMLParser):
+    """Collect video ids from a socialstream HTML fragment.
+
+    Each video is expected to appear as a list item of this form:
+
+    <li class="content123 video" data-content-id="123" data-length="1452"
+        data-href="/recorded/123" data-og-url="/recorded/123">
+
+    The numeric ``data-content-id`` values are appended to ``content_ids``
+    in the order they are encountered.
+    """
+    def __init__(self):
+        # Explicit base-class call: the file imports HTMLParser from the
+        # Python 2 module, so super() is deliberately not used here.
+        HTMLParser.__init__(self)
+        self.content_ids = []
+
+    def handle_starttag(self, tag, attrs):
+        """Record every numeric ``data-content-id`` found on an ``li`` tag."""
+        if tag != 'li':
+            return
+        for (attr, value) in attrs:
+            # A valueless attribute (<li data-content-id>) is reported with
+            # value None; skip it instead of calling .isdigit() on None.
+            if attr == 'data-content-id' and value and value.isdigit():
+                self.content_ids.append(value)
+
+class UstreamChannelIE(InfoExtractor):
+    """Extractor for Ustream channel URLs.
+
+    Resolves the channel page to its numeric channel id, walks the paged
+    socialstream JSON listing and returns the recorded videos it finds as
+    a single playlist result.
+    """
+    _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)'
+    IE_NAME = u'ustream:channel'
+
+    def _real_extract(self, url):
+        """Return a one-element list holding the playlist for ``url``."""
+        # Slugs can be non-ascii, but youtube-dl can't handle non-ascii
+        # command lines, so a slug that got this far is probably percent
+        # encoded and needs no special treatment.
+        slug = re.match(self._VALID_URL, url).group('slug')
+
+        # Step 1: the channel page carries the numeric channel id.
+        channel_parser = ChannelParser()
+        channel_page = self._download_webpage(url, slug)
+        channel_parser.feed(channel_page)
+        channel_parser.close()
+        channel_id = channel_parser.channel_id
+
+        # Step 2: follow the listing pages until no next page is given.
+        # Each reply holds an HTML fragment under 'data' and the path of
+        # the following page under 'nextUrl'.
+        video_parser = SocialstreamParser()
+        site_root = 'http://www.ustream.tv'
+        page_path = '/ajax/socialstream/videos/%s/1.json' % channel_id
+        while page_path:
+            page_url = urljoin(site_root, page_path)
+            reply = json.loads(self._download_webpage(page_url, channel_id))
+            video_parser.feed(reply['data'])
+            page_path = reply['nextUrl']
+        video_parser.close()
+        video_ids = video_parser.content_ids
+
+        # Reporting and result construction follow YoutubeChannelIE, per
+        # the original author's note.
+
+        self._downloader.to_screen(u'[ustream] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
+
+        url_entries = [
+            self.url_result('http://www.ustream.tv/recorded/' + video_id, 'Ustream')
+            for video_id in video_ids
+        ]
+        return [self.playlist_result(url_entries, channel_id)]