[youtube] Fix extraction.

[youtube-dl] / youtube_dl / extractor / tagesschau.py
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py

index f6102c22431460c2ab6df554b17f4a57ce1c3974..c351b754594a08be2f585f901c3a71ac425bcfd7 100644
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
  from __future__ import unicode_literals
  
  import re
@@ -23,7 +23,7 @@ class TagesschauPlayerIE(InfoExtractor):
              'id': '179517',
              'ext': 'mp4',
              'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
-            'thumbnail': 're:^https?:.*\.jpg$',
+            'thumbnail': r're:^https?:.*\.jpg$',
              'formats': 'mincount:6',
          },
      }, {
@@ -33,7 +33,7 @@ class TagesschauPlayerIE(InfoExtractor):
              'id': '29417',
              'ext': 'mp3',
              'title': 'Trabi - Bye, bye Rennpappe',
-            'thumbnail': 're:^https?:.*\.jpg$',
+            'thumbnail': r're:^https?:.*\.jpg$',
              'formats': 'mincount:2',
          },
      }, {
@@ -125,54 +125,54 @@ class TagesschauPlayerIE(InfoExtractor):
  
  
  class TagesschauIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)?)(?:~_?[^/#?]+?)?\.html'
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
  
      _TESTS = [{
          'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
          'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
          'info_dict': {
-            'id': '102143',
+            'id': 'video-102143',
              'ext': 'mp4',
              'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
              'description': '18.07.2015 20:10 Uhr',
-            'thumbnail': 're:^https?:.*\.jpg$',
+            'thumbnail': r're:^https?:.*\.jpg$',
          },
      }, {
          'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
          'md5': '3c54c1f6243d279b706bde660ceec633',
          'info_dict': {
-            'id': '5727',
+            'id': 'ts-5727',
              'ext': 'mp4',
              'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
              'description': 'md5:695c01bfd98b7e313c501386327aea59',
-            'thumbnail': 're:^https?:.*\.jpg$',
+            'thumbnail': r're:^https?:.*\.jpg$',
          },
      }, {
          # exclusive audio
          'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
          'md5': '76e6eec6ebd40740671cf0a2c88617e5',
          'info_dict': {
-            'id': '29417',
+            'id': 'audio-29417',
              'ext': 'mp3',
              'title': 'Trabi - Bye, bye Rennpappe',
              'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
-            'thumbnail': 're:^https?:.*\.jpg$',
+            'thumbnail': r're:^https?:.*\.jpg$',
          },
      }, {
          # audio in article
          'url': 'http://www.tagesschau.de/inland/bnd-303.html',
          'md5': 'e0916c623e85fc1d2b26b78f299d3958',
          'info_dict': {
-            'id': '303',
+            'id': 'bnd-303',
              'ext': 'mp3',
              'title': 'Viele Baustellen für neuen BND-Chef',
              'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
-            'thumbnail': 're:^https?:.*\.jpg$',
+            'thumbnail': r're:^https?:.*\.jpg$',
          },
      }, {
          'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
          'info_dict': {
-            'id': '135',
+            'id': 'afd-parteitag-135',
              'title': 'Möchtegern-Underdog mit Machtanspruch',
          },
          'playlist_count': 2,
@@ -200,6 +200,10 @@ class TagesschauIE(InfoExtractor):
      }, {
          'url': 'http://www.tagesschau.de/100sekunden/index.html',
          'only_matching': True,
+    }, {
+        # playlist article with collapsing sections
+        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
+        'only_matching': True,
      }]
  
      @classmethod
@@ -275,7 +279,7 @@ class TagesschauIE(InfoExtractor):
          if webpage_type == 'website':  # Article
              entries = []
              for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
-                    r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
+                    r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
                      webpage), 1):
                  entries.append({
                      'id': '%s-%d' % (display_id, num),