add an extractor for tv.sohu.com
author: huohuarong <huohuarong@gmail.com>
Fri, 2 Aug 2013 09:58:46 +0000 (17:58 +0800)
committer: huohuarong <huohuarong@gmail.com>
Fri, 2 Aug 2013 09:58:46 +0000 (17:58 +0800)
youtube_dl/extractor/__init__.py
youtube_dl/extractor/sohu.py [new file with mode: 0644]

index c20172a53a0372c09810b1a0ba1c0d99c8899d7c..3a08d676fb40db39f09f1c2eb09618b0516552f0 100644 (file)
@@ -55,6 +55,7 @@ from .redtube import RedTubeIE
 from .ringtv import RingTVIE
 from .roxwel import RoxwelIE
 from .sina import SinaIE
+from .sohu import SohuIE
 from .soundcloud import SoundcloudIE, SoundcloudSetIE
 from .spiegel import SpiegelIE
 from .stanfordoc import StanfordOpenClassroomIE
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
new file mode 100644 (file)
index 0000000..8308142
--- /dev/null
@@ -0,0 +1,97 @@
# encoding: utf-8

import json
import logging
import re
import time
import urllib2

from .common import InfoExtractor
from ..utils import compat_urllib_request, ExtractorError
+
+
class SohuIE(InfoExtractor):
    """Information extractor for videos hosted on tv.sohu.com.

    The page embeds a numeric ``vid``; a JSON document from
    hot.vrs.sohu.com lists alternate vids per quality level
    (oriVid/superVid/highVid/norVid) plus the per-part clip URLs
    used to build each part's final download URL.
    """

    _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'

    _TEST = {
        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
        # NOTE(review): 'file' says .flv but the parts below are emitted
        # with ext 'mp4' — confirm which extension the downloader produces.
        u'file': u'382479172.flv',
        u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b',
        u'info_dict': {
            u'title': u'The Illest - Far East Movement Riff Raff',
        },
    }

    def _clearn_html(self, string):
        """Strip HTML tags from *string* and collapse whitespace runs.

        Name kept (historical misspelling) so existing callers are
        unaffected; previously this looped over ``str.replace`` calls,
        which is quadratic — two ``re.sub`` passes are equivalent.
        """
        string = re.sub(r'<.+?>', ' ', string)   # every tag becomes a space
        string = re.sub(r'\s+', ' ', string)     # squash whitespace runs
        return string.strip()

    def _real_extract(self, url):
        """Extract title and per-part download URLs for a Sohu video.

        Returns a list of info dicts, one per part. Raises
        ExtractorError when the page carries no vid or no quality
        variant is available (previously these paths returned None,
        which crashed the downloader later with no explanation).
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        pattern = re.compile(r'<h1 id="video-title">\n*?(.+?)\n*?</h1>',
                             re.DOTALL)
        title = self._search_regex(pattern, webpage, u'video title').strip('\t\n')
        title = self._clearn_html(title)

        result = re.search(r'var vid="(\d+)"', webpage)
        if not result:
            raise ExtractorError(u'[Sohu] could not get vid')
        vid = result.group(1)
        self.to_screen(u'vid: %s' % vid)

        base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
        url_1 = base_url_1 + vid
        self.to_screen(u'json url: %s' % url_1)
        # compat_urllib_request instead of urllib2 so the module keeps
        # working under Python 3 (the file already imports the shim).
        json_1 = json.loads(compat_urllib_request.urlopen(url_1).read())

        # Pick the highest-definition variant whose vid is non-zero.
        qualities = ('oriVid', 'superVid', 'highVid', 'norVid')
        clearest_vid = 0
        for quality in qualities:
            quality_vid = json_1['data'][quality]
            if quality_vid != 0:
                clearest_vid = quality_vid
                self.to_screen(u'quality definition: %s' % quality[:-3])
                break
        if not clearest_vid:
            raise ExtractorError(u'could not find valid clearest_vid')

        # vid comes from a regex (str) while clearest_vid comes from the
        # JSON (int); the original `vid != clearest_vid` was therefore
        # always true and re-fetched an identical JSON document.
        if int(vid) != clearest_vid:
            url_1 = '%s%d' % (base_url_1, clearest_vid)
            self.to_screen(u'highest definition json url: %s' % url_1)
            json_1 = json.loads(compat_urllib_request.urlopen(url_1).read())

        allot = json_1['allot']
        prot = json_1['prot']
        clipsURL = json_1['data']['clipsURL']
        su = json_1['data']['su']
        num_of_parts = json_1['data']['totalBlocks']
        self.to_screen(u'Total parts: %d' % num_of_parts)

        files_info = []
        for i in range(num_of_parts):
            middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (
                allot, prot, clipsURL[i], su[i])
            self.to_screen(u'middle url part %d: %s' % (i, middle_url))
            # Pipe-separated response; field 0 is the CDN prefix and
            # field 3 the access key (per the URL assembled below).
            middle_info = compat_urllib_request.urlopen(middle_url).read().split('|')
            download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3])

            files_info.append({
                'id': '%s_part%02d' % (video_id, i + 1),
                'title': title,
                'url': download_url,
                'ext': 'mp4',
            })
            # Be gentle with Sohu's servers between per-part requests.
            time.sleep(1)

        return files_info