Merge remote-tracking branch 'origin/master' into HEAD
author    Philipp Hagemeister <phihag@phihag.de>
          Mon, 24 Mar 2014 22:23:17 +0000 (23:23 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
          Mon, 24 Mar 2014 22:23:17 +0000 (23:23 +0100)
Conflicts:
youtube_dl/extractor/arte.py

test/test_utils.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/clipfish.py
youtube_dl/extractor/rts.py
youtube_dl/extractor/washingtonpost.py [new file with mode: 0644]
youtube_dl/utils.py

diff --git a/test/test_utils.py b/test/test_utils.py
index e920d661f6e520afcc4bcb0308e7faac2370e92a..2348c04159e54cb5fe293fcd300c2f25116588a6 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 # Various small unit tests
 import io
+import json
 import xml.etree.ElementTree
 
 #from youtube_dl.utils import htmlentity_transform
@@ -36,6 +37,7 @@ from youtube_dl.utils import (
     urlencode_postdata,
     xpath_with_ns,
     parse_iso8601,
+    strip_jsonp,
 )
 
 if sys.version_info < (3, 0):
@@ -272,5 +274,11 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
         self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
 
+    def test_strip_jsonp(self):
+        stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);')
+        d = json.loads(stripped)
+        self.assertEqual(d, [{"id": "532cb", "x": 3}])
+
+
 if __name__ == '__main__':
     unittest.main()
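
For reference, the new test can be exercised on its own. A minimal runner, run from inside the test/ directory (test_utils.py itself puts the repository root on sys.path for the youtube_dl import):

    import unittest
    import test_utils

    # Load only the new JSONP test from the module just imported:
    suite = unittest.defaultTestLoader.loadTestsFromName(
        'TestUtil.test_strip_jsonp', test_utils)
    unittest.TextTestRunner(verbosity=2).run(suite)
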
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 39b250b103a2401a3138e5abf8d4b01b2a3d0a93..56b382aed637de7076591bb10bda1b7659298e09 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -282,6 +282,7 @@ from .vine import VineIE
 from .viki import VikiIE
 from .vk import VKIE
 from .vube import VubeIE
+from .washingtonpost import WashingtonPostIE
 from .wat import WatIE
 from .wdr import WDRIE
 from .weibo import WeiboIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 2a123880e62c4154d9772ded5fb6775de50509e2..646377e4b0f16c9b3628987b51c3ff530c12b297 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
 from ..utils import (
@@ -26,8 +25,8 @@ class ArteTvIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
         lang = mobj.group('lang')
+        video_id = mobj.group('id')
 
         ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
         ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index 43efb08bfc33accf5661bf0afa3d59aeb1bb0c0e..669919a2cc9039ffb91ae052b96d5531665341e0 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,22 +1,28 @@
+from __future__ import unicode_literals
+
 import re
 import time
 import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+)
 
 
 class ClipfishIE(InfoExtractor):
-    IE_NAME = u'clipfish'
+    IE_NAME = 'clipfish'
 
     _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
     _TEST = {
-        u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
-        u'file': u'3966754.mp4',
-        u'md5': u'2521cd644e862936cf2e698206e47385',
-        u'info_dict': {
-            u'title': u'FIFA 14 - E3 2013 Trailer',
-            u'duration': 82,
+        'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
+        'md5': '2521cd644e862936cf2e698206e47385',
+        'info_dict': {
+            'id': '3966754',
+            'ext': 'mp4',
+            'title': 'FIFA 14 - E3 2013 Trailer',
+            'duration': 82,
         },
         u'skip': 'Blocked in the US'
     }
@@ -33,21 +39,10 @@ class ClipfishIE(InfoExtractor):
         video_url = doc.find('filename').text
         if video_url is None:
             xml_bytes = xml.etree.ElementTree.tostring(doc)
-            raise ExtractorError(u'Cannot find video URL in document %r' %
+            raise ExtractorError('Cannot find video URL in document %r' %
                                  xml_bytes)
         thumbnail = doc.find('imageurl').text
-        duration_str = doc.find('duration').text
-        m = re.match(
-            r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
-            duration_str)
-        if m:
-            duration = (
-                (int(m.group('hours')) * 60 * 60) +
-                (int(m.group('minutes')) * 60) +
-                (int(m.group('seconds')))
-            )
-        else:
-            duration = None
+        duration = parse_duration(doc.find('duration').text)
 
         return {
             'id': video_id,
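
The hand-rolled H:MM:SS:ms parsing removed here matches what the extended parse_duration (see the youtube_dl/utils.py hunk below) now does centrally. A quick equivalence check, with an invented sample value and assuming this commit's youtube_dl.utils is importable:

    from youtube_dl.utils import parse_duration

    # clipfish's <duration> text has the form H:MM:SS:ms; the old inline
    # regex ignored the ms field, and so does the new helper:
    assert parse_duration('0:01:22:00') == 82
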
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index f211637a74fe0056a9553f1ed17da4b9f688f5ba..bcdfbdd5691410e43d3b030f4f8059cfc8497c6a 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -28,6 +28,7 @@ class RTSIE(InfoExtractor):
             'uploader': 'Divers',
             'upload_date': '19680921',
             'timestamp': -40280400,
+            'thumbnail': 're:^https?://.*\.image'
         },
     }
 
@@ -58,4 +59,5 @@ class RTSIE(InfoExtractor):
             'duration': duration,
             'uploader': info.get('programName'),
             'timestamp': upload_timestamp,
+            'thumbnail': thumbnail,
         }
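
The 're:' prefix in the expected thumbnail value is the test suite's convention for "match this regex instead of comparing literally". A rough sketch of the idea (the real logic lives in the project's test helpers; this is not the exact code):

    import re

    def expect_field(expected, got):
        # Values prefixed with 're:' are treated as patterns; everything
        # else is compared for equality.
        if isinstance(expected, str) and expected.startswith('re:'):
            assert re.match(expected[len('re:'):], got)
        else:
            assert expected == got

    expect_field(r're:^https?://.*\.image',
                 'http://www.rts.ch/video/thumb.image/16x9.jpg')
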
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
new file mode 100644
index 0000000..cb8f088
--- /dev/null
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -0,0 +1,103 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    strip_jsonp,
+)
+
+
+class WashingtonPostIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+    _TEST = {
+        'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
+        'playlist': [{
+            'md5': 'c3f4b4922ffa259243f68e928db2db8c',
+            'info_dict': {
+                'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
+                'ext': 'mp4',
+                'title': 'Breaking Points: The Paper Mine',
+                'duration': 1287,
+                'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
+                'uploader': 'The Washington Post',
+                'timestamp': 1395527908,
+                'upload_date': '20140322',
+            },
+        }, {
+            'md5': 'f645a07652c2950cd9134bb852c5f5eb',
+            'info_dict': {
+                'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
+                'ext': 'mp4',
+                'title': 'The town bureaucracy sustains',
+                'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
+                'duration': 2217,
+                'timestamp': 1395528005,
+                'upload_date': '20140322',
+                'uploader': 'The Washington Post',
+            },
+        }]
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        page_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, page_id)
+        title = self._og_search_title(webpage)
+        uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
+        entries = []
+        for i, uuid in enumerate(uuids, start=1):
+            vinfo_all = self._download_json(
+                'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
+                page_id,
+                transform_source=strip_jsonp,
+                note='Downloading information of video %d/%d' % (i, len(uuids))
+            )
+            vinfo = vinfo_all[0]['contentConfig']
+            uploader = vinfo.get('credits', {}).get('source')
+            timestamp = int_or_none(
+                vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
+
+            formats = [{
+                'format_id': (
+                    '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
+                    if s.get('width')
+                    else s.get('type')),
+                'vbr': s.get('bitrate') if s.get('width') != 0 else None,
+                'width': s.get('width'),
+                'height': s.get('height'),
+                'acodec': s.get('audioCodec'),
+                'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
+                'filesize': s.get('fileSize'),
+                'url': s.get('url'),
+                'ext': 'mp4',
+                'protocol': {
+                    'MP4': 'http',
+                    'F4F': 'f4m',
+                }.get(s.get('type'))
+            } for s in vinfo.get('streams', [])]
+            source_media_url = vinfo.get('sourceMediaURL')
+            if source_media_url:
+                formats.append({
+                    'format_id': 'source_media',
+                    'url': source_media_url,
+                })
+            self._sort_formats(formats)
+            entries.append({
+                'id': uuid,
+                'title': vinfo['title'],
+                'description': vinfo.get('blurb'),
+                'uploader': uploader,
+                'formats': formats,
+                'duration': int_or_none(vinfo.get('videoDuration'), 100),
+                'timestamp': timestamp,
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': page_id,
+            'title': title,
+        }
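
Each element of vinfo['streams'] is turned into a format dict by the comprehension above. As an illustration, a stream record shaped like the hypothetical one below (all field values invented) would come out as noted in the comments:

    # Hypothetical stream entry with the keys the comprehension reads:
    s = {
        'type': 'MP4', 'width': 1280, 'height': 720, 'bitrate': 2000,
        'audioCodec': 'aac', 'videoCodec': 'h264',
        'fileSize': 12345678, 'url': 'http://example.invalid/clip.mp4',
    }
    # Expected mapping:
    #   format_id -> 'MP4-1280-2000'  (type-width-bitrate, since width is set)
    #   vbr -> 2000, vcodec -> 'h264' (width != 0, so treated as video)
    #   protocol -> 'http'            ('MP4' maps to http, 'F4F' to f4m)
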
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 68d590ba25f812ba96f0c97d81f932200059a155..29c9b1a4cb6fdef970617fd6b556454cb7eff07d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1186,7 +1186,7 @@ def parse_duration(s):
         return None
 
     m = re.match(
-        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
+        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
     if not m:
         return None
     res = int(m.group('secs'))
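
The only functional change here is the optional trailing (?::[0-9]+)? group: a final colon-separated field (clipfish's millisecond component) is now accepted and discarded, while previously valid inputs parse exactly as before. A few spot checks, assuming this commit's youtube_dl.utils:

    from youtube_dl.utils import parse_duration

    assert parse_duration('45s') == 45             # unchanged
    assert parse_duration('9:05') == 545           # unchanged
    assert parse_duration('1:02:03:050') == 3723   # trailing ms field now tolerated
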
@@ -1328,3 +1328,7 @@ US_RATINGS = {
     'R': 16,
     'NC': 18,
 }
+
+
+def strip_jsonp(code):
+    return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*$', r'\1', code)
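
strip_jsonp reduces a JSONP response to its bare JSON payload by cutting off the callback wrapper; this is what the washingtonpost extractor passes as transform_source to _download_json. A minimal demonstration, with an invented callback name and payload (the new test above exercises the same path):

    import json
    from youtube_dl.utils import strip_jsonp

    jsonp = 'someCallback({"status": "ok"});'
    print(json.loads(strip_jsonp(jsonp)))  # -> {'status': 'ok'}
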