Merge remote-tracking branch 'origin/master' into HEAD
author    Philipp Hagemeister <phihag@phihag.de>
          Mon, 24 Mar 2014 22:23:17 +0000 (23:23 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
          Mon, 24 Mar 2014 22:23:17 +0000 (23:23 +0100)
Conflicts:
youtube_dl/extractor/arte.py

test/test_utils.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/clipfish.py
youtube_dl/extractor/rts.py
youtube_dl/extractor/washingtonpost.py [new file with mode: 0644]
youtube_dl/utils.py

diff --git a/test/test_utils.py b/test/test_utils.py
index e920d661f6e520afcc4bcb0308e7faac2370e92a..2348c04159e54cb5fe293fcd300c2f25116588a6 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 # Various small unit tests
 import io
+import json
 import xml.etree.ElementTree
 
 #from youtube_dl.utils import htmlentity_transform
@@ -36,6 +37,7 @@ from youtube_dl.utils import (
     urlencode_postdata,
     xpath_with_ns,
     parse_iso8601,
+    strip_jsonp,
 )
 
 if sys.version_info < (3, 0):
@@ -272,5 +274,11 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
         self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
 
+    def test_strip_jsonp(self):
+        stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);')
+        d = json.loads(stripped)
+        self.assertEqual(d, [{"id": "532cb", "x": 3}])
+
+
 if __name__ == '__main__':
     unittest.main()
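
For reference, the new test can be exercised on its own. A minimal runner, run from inside the test/ directory (test_utils.py itself puts the repository root on sys.path for the youtube_dl import):

    import unittest
    import test_utils

    # Load only the new JSONP test from the module just imported:
    suite = unittest.defaultTestLoader.loadTestsFromName(
        'TestUtil.test_strip_jsonp', test_utils)
    unittest.TextTestRunner(verbosity=2).run(suite)
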
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 39b250b103a2401a3138e5abf8d4b01b2a3d0a93..56b382aed637de7076591bb10bda1b7659298e09 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -282,6 +282,7 @@ from .vine import VineIE
 from .viki import VikiIE
 from .vk import VKIE
 from .vube import VubeIE
+from .washingtonpost import WashingtonPostIE
 from .wat import WatIE
 from .wdr import WDRIE
 from .weibo import WeiboIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 2a123880e62c4154d9772ded5fb6775de50509e2..646377e4b0f16c9b3628987b51c3ff530c12b297 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
 from ..utils import (
@@ -26,8 +25,8 @@ class ArteTvIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
         lang = mobj.group('lang')
+        video_id = mobj.group('id')
 
         ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
         ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index 43efb08bfc33accf5661bf0afa3d59aeb1bb0c0e..669919a2cc9039ffb91ae052b96d5531665341e0 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,22 +1,28 @@
+from __future__ import unicode_literals
+
 import re
 import time
 import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+)
 
 
 class ClipfishIE(InfoExtractor):
-    IE_NAME = u'clipfish'
+    IE_NAME = 'clipfish'
 
     _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
     _TEST = {
-        u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
-        u'file': u'3966754.mp4',
-        u'md5': u'2521cd644e862936cf2e698206e47385',
-        u'info_dict': {
-            u'title': u'FIFA 14 - E3 2013 Trailer',
-            u'duration': 82,
+        'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
+        'md5': '2521cd644e862936cf2e698206e47385',
+        'info_dict': {
+            'id': '3966754',
+            'ext': 'mp4',
+            'title': 'FIFA 14 - E3 2013 Trailer',
+            'duration': 82,
         },
         u'skip': 'Blocked in the US'
     }
@@ -33,21 +39,10 @@ class ClipfishIE(InfoExtractor):
         video_url = doc.find('filename').text
         if video_url is None:
             xml_bytes = xml.etree.ElementTree.tostring(doc)
-            raise ExtractorError(u'Cannot find video URL in document %r' %
+            raise ExtractorError('Cannot find video URL in document %r' %
                                  xml_bytes)
         thumbnail = doc.find('imageurl').text
-        duration_str = doc.find('duration').text
-        m = re.match(
-            r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
-            duration_str)
-        if m:
-            duration = (
-                (int(m.group('hours')) * 60 * 60) +
-                (int(m.group('minutes')) * 60) +
-                (int(m.group('seconds')))
-            )
-        else:
-            duration = None
+        duration = parse_duration(doc.find('duration').text)
 
         return {
             'id': video_id,
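
The hand-rolled H:MM:SS:ms parsing removed here matches what the extended parse_duration (see the youtube_dl/utils.py hunk below) now does centrally. A quick equivalence check, with an invented sample value and assuming this commit's youtube_dl.utils is importable:

    from youtube_dl.utils import parse_duration

    # clipfish's <duration> text has the form H:MM:SS:ms; the old inline
    # regex ignored the ms field, and so does the new helper:
    assert parse_duration('0:01:22:00') == 82
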
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index f211637a74fe0056a9553f1ed17da4b9f688f5ba..bcdfbdd5691410e43d3b030f4f8059cfc8497c6a 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -28,6 +28,7 @@ class RTSIE(InfoExtractor):
             'uploader': 'Divers',
             'upload_date': '19680921',
             'timestamp': -40280400,
+            'thumbnail': 're:^https?://.*\.image'
         },
     }
 
@@ -58,4 +59,5 @@ class RTSIE(InfoExtractor):
             'duration': duration,
             'uploader': info.get('programName'),
             'timestamp': upload_timestamp,
+            'thumbnail': thumbnail,
         }
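
The 're:' prefix in the expected thumbnail value is the test suite's convention for "match this regex instead of comparing literally". A rough sketch of the idea (the real logic lives in the project's test helpers; this is not the exact code):

    import re

    def expect_field(expected, got):
        # Values prefixed with 're:' are treated as patterns; everything
        # else is compared for equality.
        if isinstance(expected, str) and expected.startswith('re:'):
            assert re.match(expected[len('re:'):], got)
        else:
            assert expected == got

    expect_field(r're:^https?://.*\.image',
                 'http://www.rts.ch/video/thumb.image/16x9.jpg')
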
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
new file mode 100644
index 0000000..cb8f088
--- /dev/null
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -0,0 +1,103 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    strip_jsonp,
+)
+
+
+class WashingtonPostIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+    _TEST = {
+        'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
+        'playlist': [{
+            'md5': 'c3f4b4922ffa259243f68e928db2db8c',
+            'info_dict': {
+                'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
+                'ext': 'mp4',
+                'title': 'Breaking Points: The Paper Mine',
+                'duration': 1287,
+                'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
+                'uploader': 'The Washington Post',
+                'timestamp': 1395527908,
+                'upload_date': '20140322',
+            },
+        }, {
+            'md5': 'f645a07652c2950cd9134bb852c5f5eb',
+            'info_dict': {
+                'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
+                'ext': 'mp4',
+                'title': 'The town bureaucracy sustains',
+                'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
+                'duration': 2217,
+                'timestamp': 1395528005,
+                'upload_date': '20140322',
+                'uploader': 'The Washington Post',
+            },
+        }]
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        page_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, page_id)
+        title = self._og_search_title(webpage)
+        uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
+        entries = []
+        for i, uuid in enumerate(uuids, start=1):
+            vinfo_all = self._download_json(
+                'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
+                page_id,
+                transform_source=strip_jsonp,
+                note='Downloading information of video %d/%d' % (i, len(uuids))
+            )
+            vinfo = vinfo_all[0]['contentConfig']
+            uploader = vinfo.get('credits', {}).get('source')
+            timestamp = int_or_none(
+                vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
+
+            formats = [{
+                'format_id': (
+                    '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
+                    if s.get('width')
+                    else s.get('type')),
+                'vbr': s.get('bitrate') if s.get('width') != 0 else None,
+                'width': s.get('width'),
+                'height': s.get('height'),
+                'acodec': s.get('audioCodec'),
+                'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
+                'filesize': s.get('fileSize'),
+                'url': s.get('url'),
+                'ext': 'mp4',
+                'protocol': {
+                    'MP4': 'http',
+                    'F4F': 'f4m',
+                }.get(s.get('type'))
+            } for s in vinfo.get('streams', [])]
+            source_media_url = vinfo.get('sourceMediaURL')
+            if source_media_url:
+                formats.append({
+                    'format_id': 'source_media',
+                    'url': source_media_url,
+                })
+            self._sort_formats(formats)
+            entries.append({
+                'id': uuid,
+                'title': vinfo['title'],
+                'description': vinfo.get('blurb'),
+                'uploader': uploader,
+                'formats': formats,
+                'duration': int_or_none(vinfo.get('videoDuration'), 100),
+                'timestamp': timestamp,
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': page_id,
+            'title': title,
+        }
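
Each element of vinfo['streams'] is turned into a format dict by the comprehension above. As an illustration, a stream record shaped like the hypothetical one below (all field values invented) would come out as noted in the comments:

    # Hypothetical stream entry with the keys the comprehension reads:
    s = {
        'type': 'MP4', 'width': 1280, 'height': 720, 'bitrate': 2000,
        'audioCodec': 'aac', 'videoCodec': 'h264',
        'fileSize': 12345678, 'url': 'http://example.invalid/clip.mp4',
    }
    # Expected mapping:
    #   format_id -> 'MP4-1280-2000'  (type-width-bitrate, since width is set)
    #   vbr -> 2000, vcodec -> 'h264' (width != 0, so treated as video)
    #   protocol -> 'http'            ('MP4' maps to http, 'F4F' to f4m)
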
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 68d590ba25f812ba96f0c97d81f932200059a155..29c9b1a4cb6fdef970617fd6b556454cb7eff07d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1186,7 +1186,7 @@ def parse_duration(s):
         return None
 
     m = re.match(
-        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
+        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
     if not m:
         return None
     res = int(m.group('secs'))
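
The only functional change here is the optional trailing (?::[0-9]+)? group: a final colon-separated field (clipfish's millisecond component) is now accepted and discarded, while previously valid inputs parse exactly as before. A few spot checks, assuming this commit's youtube_dl.utils:

    from youtube_dl.utils import parse_duration

    assert parse_duration('45s') == 45             # unchanged
    assert parse_duration('9:05') == 545           # unchanged
    assert parse_duration('1:02:03:050') == 3723   # trailing ms field now tolerated
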
@@ -1328,3 +1328,7 @@ US_RATINGS = {
     'R': 16,
     'NC': 18,
 }
+
+
+def strip_jsonp(code):
+    return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*$', r'\1', code)
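
strip_jsonp reduces a JSONP response to its bare JSON payload by cutting off the callback wrapper; this is what the washingtonpost extractor passes as transform_source to _download_json. A minimal demonstration, with an invented callback name and payload (the new test above exercises the same path):

    import json
    from youtube_dl.utils import strip_jsonp

    jsonp = 'someCallback({"status": "ok"});'
    print(json.loads(strip_jsonp(jsonp)))  # -> {'status': 'ok'}
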