[wistia] Add extractor

author Philipp Hagemeister <phihag@phihag.de>

Fri, 6 Dec 2013 08:15:04 +0000 (09:15 +0100)

committer Philipp Hagemeister <phihag@phihag.de>

Fri, 6 Dec 2013 08:15:04 +0000 (09:15 +0100)
author Philipp Hagemeister <phihag@phihag.de>
Fri, 6 Dec 2013 08:15:04 +0000 (09:15 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Fri, 6 Dec 2013 08:15:04 +0000 (09:15 +0100)
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 8ad7bd1dac65a7cc932c46d9368a607d9d112819..07b36a98ea6d229c6044c94f1fdffecce2dae445 100644 (file)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -488,7 +488,8 @@ class YoutubeDL(object):
                  new_result = ie_result.copy()
                  for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                            'entries', 'urlhandle', 'ie_key', 'duration',
-                          'subtitles', 'annotations', 'format'):
+                          'subtitles', 'annotations', 'format',
+                          'thumbnail', 'thumbnails'):
                      if f in new_result:
                          del new_result[f]
                      if f in embedded_info:
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index a78dcad7fd93c438904549e376a1999f4d84ab0c..a7d37d48b49b815b6dba4031cce2e9650381b6fd 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -178,6 +178,7 @@ from .wat import WatIE
  from .websurg import WeBSurgIE
  from .weibo import WeiboIE
  from .wimp import WimpIE
+from .wistia import WistiaIE
  from .worldstarhiphop import WorldStarHipHopIE
  from .xhamster import XHamsterIE
  from .xnxx import XNXXIE
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 10ae06263ef1349ff6526575feb61cb71591f3cc..216e032186297b7b91a488fb1edd1421e3270b39 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -169,8 +169,13 @@ class GenericIE(InfoExtractor):
          #   Site Name | Video Title
          #   Video Title - Tagline | Site Name
          # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'video title', default=u'video', flags=re.DOTALL)
+        video_title = self._html_search_regex(
+            r'(?s)<title>(.*?)</title>', webpage, u'video title',
+            default=u'video')
+
+        # video uploader is domain name
+        video_uploader = self._search_regex(
+            r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
  
          # Look for BrightCove:
          bc_url = BrightcoveIE._extract_brightcove_url(webpage)
@@ -188,7 +193,7 @@ class GenericIE(InfoExtractor):
  
          # Look for embedded YouTube player
          matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
          if matches:
              urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
                       for tuppl in matches]
@@ -197,13 +202,26 @@ class GenericIE(InfoExtractor):
  
          # Look for embedded Dailymotion player
          matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
          if matches:
              urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
                       for tuppl in matches]
              return self.playlist_result(
                  urlrs, playlist_id=video_id, playlist_title=video_title)
  
+        # Look for embedded Wistia player
+        match = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+        if match:
+            return {
+                '_type': 'url_transparent',
+                'url': unescapeHTML(match.group('url')),
+                'ie_key': 'Wistia',
+                'uploader': video_uploader,
+                'title': video_title,
+                'id': video_id,
+            }
+
          # Look for Bandcamp pages with custom domain
          mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
          if mobj is not None:
@@ -247,14 +265,9 @@ class GenericIE(InfoExtractor):
          # here's a fun little line of code for you:
          video_id = os.path.splitext(video_id)[0]
  
-        # video uploader is domain name
-        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
-            url, u'video uploader')
-
          return {
              'id':       video_id,
              'url':      video_url,
              'uploader': video_uploader,
-            'upload_date':  None,
              'title':    video_title,
          }
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py

new file mode 100644 (file)

index 0000000..e1748c2
--- /dev/null
+++ b/youtube_dl/extractor/wistia.py
@@ -0,0 +1,55 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class WistiaIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
+
+    _TEST = {
+        u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
+        u"file": u"sh7fpupwlt.mov",
+        u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
+        u"info_dict": {
+            u"title": u"cfh_resourceful_zdkh_final_1"
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        data_json = self._html_search_regex(
+            r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data')
+
+        data = json.loads(data_json)
+
+        formats = []
+        thumbnails = []
+        for atype, a in data['assets'].items():
+            if atype == 'still':
+                thumbnails.append({
+                    'url': a['url'],
+                    'resolution': '%dx%d' % (a['width'], a['height']),
+                })
+                continue
+            if atype == 'preview':
+                continue
+            formats.append({
+                'format_id': atype,
+                'url': a['url'],
+                'width': a['width'],
+                'height': a['height'],
+                'filesize': a['size'],
+                'ext': a['ext'],
+            })
+        formats.sort(key=lambda a: a['filesize'])
+
+        return {
+            'id': video_id,
+            'title': data['name'],
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
author	Philipp Hagemeister <phihag@phihag.de>
	Fri, 6 Dec 2013 08:15:04 +0000 (09:15 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Fri, 6 Dec 2013 08:15:04 +0000 (09:15 +0100)
youtube_dl/YoutubeDL.py		patch | blob | history
youtube_dl/extractor/__init__.py		patch | blob | history
youtube_dl/extractor/generic.py		patch | blob | history
youtube_dl/extractor/wistia.py	[new file with mode: 0644]	patch | blob