From: Philipp Hagemeister <phihag@phihag.de>
Date: Mon, 2 Feb 2015 20:48:54 +0000 (+0100)
Subject: [ntvde] Add new extractor (Fixes #4850)
X-Git-Url: http://git.bitcoin.ninja/?a=commitdiff_plain;h=8f4b58d70e29df13d6002579b5c9448acc89d6c1;p=youtube-dl

[ntvde] Add new extractor (Fixes #4850)
---

diff --git a/test/test_utils.py b/test/test_utils.py
index ebec7986f..0ffccd35f 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -238,6 +238,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_duration('5 s'), 5)
         self.assertEqual(parse_duration('3 min'), 180)
         self.assertEqual(parse_duration('2.5 hours'), 9000)
+        self.assertEqual(parse_duration('02:03:04'), 7384)
+        self.assertEqual(parse_duration('01:02:03:04'), 93784)
 
     def test_fix_xml_ampersands(self):
         self.assertEqual(
@@ -371,6 +373,16 @@ class TestUtil(unittest.TestCase):
         on = js_to_json('{"abc": true}')
         self.assertEqual(json.loads(on), {'abc': True})
 
+        # Ignore JavaScript code as well
+        on = js_to_json('''{
+            "x": 1,
+            y: "a",
+            z: some.code
+        }''')
+        d = json.loads(on)
+        self.assertEqual(d['x'], 1)
+        self.assertEqual(d['y'], 'a')
+
     def test_clean_html(self):
         self.assertEqual(clean_html('a:\nb'), 'a: b')
         self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"')
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index bd707e028..205bd4338 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -317,6 +317,7 @@ from .nrk import (
     NRKIE,
     NRKTVIE,
 )
+from .ntvde import NTVDeIE
 from .ntvru import NTVRuIE
 from .nytimes import NYTimesIE
 from .nuvid import NuvidIE
diff --git a/youtube_dl/extractor/ntvde.py b/youtube_dl/extractor/ntvde.py
new file mode 100644
index 000000000..a4f628fcd
--- /dev/null
+++ b/youtube_dl/extractor/ntvde.py
@@ -0,0 +1,65 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    parse_duration,
+)
+
+
+class NTVDeIE(InfoExtractor):
+    IE_NAME = 'n-tv.de'
+    _VALID_URL = r'https?://(?:www\.)?n-tv\.de/mediathek/videos/[^/?#]+/[^/?#]+-article(?P<id>.+)\.html'
+
+    _TESTS = [{
+        'url': 'http://www.n-tv.de/mediathek/videos/panorama/Schnee-und-Glaette-fuehren-zu-zahlreichen-Unfaellen-und-Staus-article14438086.html',
+        'md5': 'd37b7df1eea32265c51a062499ca488f',
+        'info_dict': {
+            'id': '14438086',
+            'ext': 'mp4',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'title': 'Schnee und GlÃ¤tte fÃ¼hren zu zahlreichen UnfÃ¤llen und Staus',
+            'alt_title': 'Winterchaos auf deutschen StraÃen',
+            'description': 'Schnee und GlÃ¤tte sorgen deutschlandweit fÃ¼r einen chaotischen Start in die Woche: Auf den StraÃen kommt es zu kilometerlangen Staus und Dutzenden GlÃ¤tteunfÃ¤llen. In DÃ¼sseldorf und MÃ¼nchen wirbelt der Schnee zudem den Flugplan durcheinander. Dutzende FlÃ¼ge landen zu spÃ¤t, einige fallen ganz aus.',
+            'duration': 4020,
+            'timestamp': 1422892797,
+            'upload_date': '20150202',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        info = self._parse_json(self._search_regex(
+            r'(?s)ntv.pageInfo.article =\s(\{.*?\});', webpage, 'info'),
+            video_id, transform_source=js_to_json)
+        timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp'))
+        vdata = self._parse_json(self._search_regex(
+            r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);',
+            webpage, 'player data'),
+            video_id, transform_source=js_to_json)
+        duration = parse_duration(vdata.get('duration'))
+        formats = [{
+            'format_id': 'flash',
+            'url': 'rtmp://fms.n-tv.de/' + vdata['video'],
+        }, {
+            'format_id': 'mp4',
+            'url': 'http://video.n-tv.de' + vdata['videoMp4'],
+        }]
+        m3u8_url = 'http://video.n-tv.de' + vdata['videoM3u8']
+        formats.extend(self._extract_m3u8_formats(m3u8_url, video_id))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': info['headline'],
+            'description': info.get('intro'),
+            'alt_title': info.get('kicker'),
+            'timestamp': timestamp,
+            'thumbnail': vdata.get('html5VideoPoster'),
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py
index 8b9166d48..963bb715a 100644
--- a/youtube_dl/extractor/ntvru.py
+++ b/youtube_dl/extractor/ntvru.py
@@ -10,6 +10,7 @@ from ..utils import (
 
 
 class NTVRuIE(InfoExtractor):
+    IE_NAME = 'ntv.ru'
     _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)'
 
     _TESTS = [
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 4ade0554e..251074bf5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1275,7 +1275,10 @@ def parse_duration(s):
             (?P<only_hours>[0-9.]+)\s*(?:hours?)|
 
             (?:
-                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
+                (?:
+                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
+                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
+                )?
                 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
             )?
             (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
@@ -1293,6 +1296,8 @@ def parse_duration(s):
         res += int(m.group('mins')) * 60
     if m.group('hours'):
         res += int(m.group('hours')) * 60 * 60
+    if m.group('days'):
+        res += int(m.group('days')) * 24 * 60 * 60
     if m.group('ms'):
         res += float(m.group('ms'))
     return res
@@ -1543,7 +1548,7 @@ def js_to_json(code):
     res = re.sub(r'''(?x)
         "(?:[^"\\]*(?:\\\\|\\")?)*"|
         '(?:[^'\\]*(?:\\\\|\\')?)*'|
-        [a-zA-Z_][a-zA-Z_0-9]*
+        [a-zA-Z_][.a-zA-Z_0-9]*
         ''', fix_kv, code)
     res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
     return res