[tudou] Use single quotes and compat_str
[youtube-dl] / youtube_dl / extractor / tudou.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5 from .common import InfoExtractor
6 from ..compat import compat_str
7
8
class TudouIE(InfoExtractor):
    """Information extractor for tudou.com video pages.

    Handles ``listplay``, ``programs`` (optionally ``programs/view``) and
    ``albumplay`` URLs, as described by ``_VALID_URL``.
    """

    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
    _TESTS = [{
        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
        'md5': '140a49ed444bd22f93330985d8475fcb',
        'info_dict': {
            'id': '159448201',
            'ext': 'f4v',
            'title': '卡马乔国足开大脚长传冲吊集锦',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
        'info_dict': {
            'id': '117049447',
            'ext': 'f4v',
            'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html',
        'only_matching': True,
    }]

    # Fallback used as the Referer when the page does not declare a playerUrl.
    _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'

    def _url_for_id(self, video_id, quality=None):
        """Return the media URL for one video part.

        Requests ``http://v2.tudou.com/f?id=<video_id>`` (with the quality
        suffix appended when ``quality`` is truthy) through
        ``_download_xml`` and returns the ``.text`` of the result.

        :param video_id: identifier of the part; converted with compat_str.
        :param quality: optional quality key (a string) to request.
        """
        info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
        if quality:
            # NOTE(review): this appends 'hd<quality>' with no '=' although
            # the comment in _real_extract speaks of an "hd field" -- confirm
            # the format expected by the endpoint before changing it.
            info_url += '&hd' + quality
        xml_data = self._download_xml(
            info_url, video_id, 'Opening the info XML page')
        return xml_data.text

    def _real_extract(self, url):
        """Extract all parts of the video at ``url``.

        Returns a ``url_result`` pointing at the Youku extractor when the
        page carries a non-empty ``vcode``; otherwise returns a
        ``multi_video`` dict whose entries are the individual parts at the
        highest numeric quality listed in the page's ``segs`` data.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Pages that expose a vcode are handed over to the Youku extractor.
        youku_vcode = self._search_regex(
            r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None)
        if youku_vcode:
            return self.url_result('youku:' + youku_vcode, ie='Youku')

        title = self._search_regex(
            r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title')
        # The thumbnail is optional: a missing match must not abort extraction.
        thumbnail_url = self._search_regex(
            r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False)

        player_url = self._search_regex(
            r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]',
            webpage, 'player URL', default=self._PLAYER_URL)

        segments = self._parse_json(self._search_regex(
            r'segs: \'([^\']+)\'', webpage, 'segments'), video_id)
        # It looks like the keys are the values that have to be passed as
        # the hd field in the request URL; we pick the highest one.
        # Non-numeric qualities are filtered out (see issue #3643).
        quality = sorted(
            (key for key in segments if key.isdigit()), key=int)[-1]
        parts = segments[quality]

        if len(parts) > 1:
            self.to_screen('%s: found %s parts' % (video_id, len(parts)))

        entries = []
        for part in parts:
            part_id = part['k']
            final_url = self._url_for_id(part_id, quality)
            # The extension is whatever follows the last dot of the URL
            # once the query string has been dropped.
            ext = final_url.split('?')[0].split('.')[-1]
            entries.append({
                'id': '%s' % part_id,
                'url': final_url,
                'ext': ext,
                'title': title,
                'thumbnail': thumbnail_url,
                'http_headers': {
                    'Referer': player_url,
                },
            })

        return {
            '_type': 'multi_video',
            'entries': entries,
            'id': video_id,
            'title': title,
        }