[ndtv] Add extractor (Fixes #1924)
author Philipp Hagemeister <phihag@phihag.de>
Mon, 9 Dec 2013 18:39:41 +0000 (19:39 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 9 Dec 2013 18:44:33 +0000 (19:44 +0100)
youtube_dl/extractor/__init__.py
youtube_dl/extractor/ndtv.py [new file with mode: 0644]
youtube_dl/utils.py

index 3f740baa13ff8c2c5f6891cc32042ed14b10188c..1149dc1ec497b6cfffea1da3db70dea39469f4b3 100644 (file)
@@ -100,6 +100,7 @@ from .myvideo import MyVideoIE
 from .naver import NaverIE
 from .nba import NBAIE
 from .nbc import NBCNewsIE
+from .ndtv import NDTVIE
 from .newgrounds import NewgroundsIE
 from .nhl import NHLIE, NHLVideocenterIE
 from .niconico import NiconicoIE
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py
new file mode 100644 (file)
index 0000000..2e8501f
--- /dev/null
@@ -0,0 +1,66 @@
+import json
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import month_by_name
+
+
+class NDTVIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)'
+
+    _TEST = {
+        u"url": u"http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710",
+        u"file": u"300710.mp4",
+        u"md5": u"39f992dbe5fb531c395d8bbedb1e5e88",
+        u"info_dict": {
+            u"title": u"NDTV exclusive: Don't need character certificate from Rahul Gandhi, says Arvind Kejriwal",
+            u"description": u"In an exclusive interview to NDTV, Aam Aadmi Party's Arvind Kejriwal says it makes no difference to him that Rahul Gandhi said the Congress needs to learn from his party.",
+            u"upload_date": u"20131208",
+            u"duration": 1327,
+            u"thumbnail": u"http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg",
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        filename = self._search_regex(
+            r"__filename='([^']+)'", webpage, u'video filename')
+        video_url = (u'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' %
+                     filename)
+
+        duration_str = self._search_regex(
+            r"__duration='([^']+)'", webpage, u'duration', fatal=False)
+        duration = None if duration_str is None else int(duration_str)
+
+        date_m = re.search(r'''(?x)
+            <p\s+class="vod_dateline">\s*
+                Published\s+On:\s*
+                (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+)
+            ''', webpage)
+        upload_date = None
+        # The dateline is optional; upload_date stays None when it is absent
+        if date_m is not None:
+            month = month_by_name(date_m.group('monthname'))
+            if month is not None:
+                upload_date = '%s%02d%02d' % (
+                    date_m.group('year'), month, int(date_m.group('day')))
+
+        description = self._og_search_description(webpage)
+        READ_MORE = u' (Read more)'
+        if description.endswith(READ_MORE):
+            description = description[:-len(READ_MORE)]
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': self._og_search_title(webpage),
+            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'duration': duration,
+            'upload_date': upload_date,
+        }
index 64300d8e0e1bcd36ffa23c4a9b5d8345f8fe7d4a..0dab9fcc5d7ad8b5d0fcae42b38bf5ac26acdb64 100644 (file)
@@ -1045,3 +1045,15 @@ def get_term_width():
     except:
         pass
     return None
+
+
+def month_by_name(name):
+    """ Return the number of a month by (locale-independently) English name """
+
+    ENGLISH_NAMES = [
+        u'January', u'February', u'March', u'April', u'May', u'June',
+        u'July', u'August', u'September', u'October', u'November', u'December']
+    try:
+        return ENGLISH_NAMES.index(name) + 1
+    except ValueError:
+        return None