[prosiebensat1] Fix broken tests
[youtube-dl] / youtube_dl / extractor / prosiebensat1.py
index 3f585bebf86d68abec429aa6f3d9c91e5a9f028b..8c9451f4053036e0575ed5551b137fe6d168b166 100644 (file)
@@ -8,8 +8,7 @@ from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
     unified_strdate,
-    clean_html,
-    RegexNotFoundError,
+    ExtractorError,
 )
 
 
@@ -87,7 +86,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'Im Interview: Kai Wiesinger',
                 'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
-                'upload_date': '20140225',
+                'upload_date': '20140203',
                 'duration': 522.56,
             },
             'params': {
@@ -102,7 +101,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
                 'description': 'md5:2669cde3febe9bce13904f701e774eb6',
-                'upload_date': '20140225',
+                'upload_date': '20141014',
                 'duration': 2410.44,
             },
             'params': {
@@ -146,8 +145,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 'id': '2156342',
                 'ext': 'mp4',
                 'title': 'Kurztrips zum Valentinstag',
-                'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
-                'upload_date': '20130206',
+                'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.',
                 'duration': 307.24,
             },
             'params': {
@@ -155,23 +153,34 @@ class ProSiebenSat1IE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist',
+            'info_dict': {
+                'id': '439664',
+                'title': 'Episode 8 - Ganze Folge - Playlist',
+                'description': 'Das finale und härteste Duell aller Zeiten ist vorbei! Der Weltmeister für dieses Jahr steht! Alle packenden Duelle der achten Episode von "Joko gegen Klaas - das Duell um die Welt" seht ihr hier noch einmal in voller Länge!',
+            },
+            'playlist_count': 2,
+        },
     ]
 
     _CLIPID_REGEXES = [
         r'"clip_id"\s*:\s+"(\d+)"',
         r'clipid: "(\d+)"',
+        r'clip[iI]d=(\d+)',
+        r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)",
     ]
     _TITLE_REGEXES = [
         r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
         r'<header class="clearfix">\s*<h3>(.+?)</h3>',
         r'<!-- start video -->\s*<h1>(.+?)</h1>',
-        r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>',
+        r'<h1 class="att-name">\s*(.+?)</h1>',
     ]
     _DESCRIPTION_REGEXES = [
         r'<p itemprop="description">\s*(.+?)</p>',
         r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
         r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
-        r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">',
+        r'<p class="att-description">\s*(.+?)\s*</p>',
     ]
     _UPLOAD_DATE_REGEXES = [
         r'<meta property="og:published_time" content="(.+?)">',
@@ -180,23 +189,49 @@ class ProSiebenSat1IE(InfoExtractor):
         r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
         r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
     ]
+    _ITEM_TYPE_REGEXES = [
+        r"'itemType'\s*:\s*'([^']*)'",
+    ]
+    _ITEM_ID_REGEXES = [
+        r"'itemId'\s*:\s*'([^']*)'",
+    ]
+    _PLAYLIST_CLIPS_REGEXES = [
+        r'data-qvt=.+?<a href="([^"]+)"',
+    ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        item_type = self._html_search_regex(self._ITEM_TYPE_REGEXES, webpage, 'item type', default='CLIP')
+        if item_type == 'CLIP':
+            return self._clip_extract(url, webpage)
+        elif item_type == 'PLAYLIST':
+            playlist_id = self._html_search_regex(self._ITEM_ID_REGEXES, webpage, 'playlist id')
+
+            for regex in self._PLAYLIST_CLIPS_REGEXES:
+                playlist_clips = re.findall(regex, webpage, re.DOTALL)
+                if playlist_clips:
+                    title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
+                    description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+                    root_url = re.match('(.+?//.+?)/', url).group(1)
 
-        page = self._download_webpage(url, video_id, 'Downloading page')
+                    return {
+                        '_type': 'playlist',
+                        'id': playlist_id,
+                        'title': title,
+                        'description': description,
+                        'entries': [self._clip_extract(root_url + clip_path) for clip_path in playlist_clips]
+                    }
+        else:
+            raise ExtractorError('Unknown item type "%s"' % item_type)
 
-        def extract(patterns, name, page, fatal=False):
-            for pattern in patterns:
-                mobj = re.search(pattern, page)
-                if mobj:
-                    return clean_html(mobj.group(1))
-            if fatal:
-                raise RegexNotFoundError(u'Unable to extract %s' % name)
-            return None
+    def _clip_extract(self, url, webpage=None):
+        if webpage is None:
+            video_id = self._match_id(url)
+            webpage = self._download_webpage(url, video_id)
 
-        clip_id = extract(self._CLIPID_REGEXES, 'clip id', page, fatal=True)
+        clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
 
         access_token = 'testclient'
         client_name = 'kolibri-1.2.5'
@@ -245,13 +280,12 @@ class ProSiebenSat1IE(InfoExtractor):
 
         urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
 
-        title = extract(self._TITLE_REGEXES, 'title', page, fatal=True)
-        description = extract(self._DESCRIPTION_REGEXES, 'description', page)
-        thumbnail = self._og_search_thumbnail(page)
+        title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
+        description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
 
-        upload_date = extract(self._UPLOAD_DATE_REGEXES, 'upload date', page)
-        if upload_date:
-            upload_date = unified_strdate(upload_date)
+        upload_date = unified_strdate(self._html_search_regex(
+            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
 
         formats = []
 
@@ -260,7 +294,7 @@ class ProSiebenSat1IE(InfoExtractor):
             urls_sources = urls_sources.values()
 
         def fix_bitrate(bitrate):
-            return bitrate / 1000 if bitrate % 1000 == 0 else bitrate
+            return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
 
         for source in urls_sources:
             protocol = source['protocol']
@@ -294,4 +328,4 @@ class ProSiebenSat1IE(InfoExtractor):
             'upload_date': upload_date,
             'duration': duration,
             'formats': formats,
-        }
\ No newline at end of file
+        }