Merge remote-tracking branch 'sagittarian/vimeo-no-desc'

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index 1bd9e25c4390018fa2a8e97b41653e5623fe96c1..0807306609bcde085046b68b8486b5b8f40a1d11 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -1130,7 +1130,7 @@ class VimeoIE(InfoExtractor):
          # Extract video description
          video_description = get_element_by_attribute("itemprop", "description", webpage)
          if video_description: video_description = clean_html(video_description)
-        else: video_description = ''
+        else: video_description = u''
  
          # Extract upload date
          video_upload_date = None
@@ -1722,9 +1722,7 @@ class YoutubePlaylistIE(InfoExtractor):
                          (?:
                             (?:course|view_play_list|my_playlists|artist|playlist|watch)
                             \? (?:.*?&)*? (?:p|a|list)=
-                        |  user/.*?/user/
                          |  p/
-                        |  user/.*?#[pg]/c/
                          )
                          ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                          .*
@@ -3808,7 +3806,7 @@ class WorldStarHipHopIE(InfoExtractor):
          _title = r"""<title>(.*)</title>"""
  
          mobj = re.search(_title, webpage_src)
-        
+
          if mobj is not None:
              title = mobj.group(1)
          else:
@@ -3826,7 +3824,7 @@ class WorldStarHipHopIE(InfoExtractor):
              if mobj is not None:
                  title = mobj.group(1)
              thumbnail = None
-        
+
          results = [{
                      'id': video_id,
                      'url' : video_url,
@@ -4358,6 +4356,46 @@ class LiveLeakIE(InfoExtractor):
  
          return [info]
  
+class ARDIE(InfoExtractor):
+    """Information extractor for ardmediathek.de and mediathek.daserste.de."""
+
+    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
+    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
+
+    def _real_extract(self, url):
+        """Extract the title and the best default-type stream of a video page.
+
+        Returns a one-element list holding the info dict ('id', 'title',
+        'ext', 'url' and, for RTMP streams, 'play_path'), or None after an
+        error has been reported through the downloader.
+        """
+        # determine video id from url: prefer the numeric documentId query
+        # parameter, fall back to the last path component of the URL
+        numid = re.search(r'documentId=([0-9]+)', url)
+        if numid:
+            video_id = numid.group(1)
+        else:
+            mobj = re.match(self._VALID_URL, url)
+            if mobj is None:
+                self._downloader.report_error(u'invalid URL: %s' % url)
+                return
+            video_id = mobj.group('video_id')
+
+        # determine title and media streams from webpage
+        html = self._download_webpage(url, video_id)
+        title_match = re.search(self._TITLE, html)
+        if title_match is None:
+            self._downloader.report_error(u'unable to extract title')
+            return
+        title = title_match.group('title')
+
+        streams = [found.groupdict()
+                   for found in re.finditer(self._MEDIA_STREAM, html)]
+        if not streams:
+            # only pages carrying the "fsk" marker are reported as
+            # time-restricted; anything else is a plain extraction failure
+            if '"fsk"' in html:
+                self._downloader.report_error(u'this video is only available after 8:00 pm')
+            else:
+                self._downloader.report_error(u'unable to extract media streams')
+            return
+
+        # choose default media type and highest quality for now
+        default_streams = [s for s in streams if int(s["media_type"]) == 0]
+        if not default_streams:
+            # max() would raise ValueError on an empty sequence
+            self._downloader.report_error(u'no stream of the default media type found')
+            return
+        stream = max(default_streams, key=lambda s: int(s["quality"]))
+
+        # there's two possibilities: RTMP stream or HTTP download
+        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
+        if stream['rtmp_url']:
+            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
+            # explicit checks instead of assert, which is stripped under -O
+            if not stream['video_url'].startswith('mp4:'):
+                self._downloader.report_error(u'unexpected RTMP play path: %s' % stream['video_url'])
+                return
+            info["url"] = stream["rtmp_url"]
+            info["play_path"] = stream['video_url']
+        else:
+            if not stream["video_url"].endswith('.mp4'):
+                self._downloader.report_error(u'unexpected video URL: %s' % stream['video_url'])
+                return
+            info["url"] = stream["video_url"]
+        return [info]
+
  
  def gen_extractors():
      """ Return a list of an instance of every supported extractor.
@@ -4411,5 +4449,6 @@ def gen_extractors():
          MySpassIE(),
          SpiegelIE(),
          LiveLeakIE(),
+        ARDIE(),
          GenericIE()
      ]