[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / thisav.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..utils import remove_end
8
9
class ThisAVIE(InfoExtractor):
    """Information extractor for video pages on thisav.com.

    The media location is looked up in three places, in this order: a
    flash-style ``addVariable('file', ...)`` call, HTML5 media elements,
    and finally embedded jwplayer data.
    """
    _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
    _TESTS = [{
        # jwplayer
        'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html',
        'md5': '0480f1ef3932d901f0e0e719f188f19b',
        'info_dict': {
            'id': '47734',
            'ext': 'flv',
            'title': '高樹マリア - Just fit',
            'uploader': 'dj7970',
            'uploader_id': 'dj7970',
        }
    }, {
        # html5 media
        'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html',
        'md5': 'ba90c076bd0f80203679e5b60bf523ee',
        'info_dict': {
            'id': '242352',
            'ext': 'mp4',
            'title': 'Nerdy 18yo Big Ass Tattoos and Glasses',
            'uploader': 'cybersluts',
            'uploader_id': 'cybersluts',
        },
    }]

    def _real_extract(self, url):
        """Download the video page at ``url`` and build its info dict.

        Returns the dict produced by whichever media lookup succeeded,
        updated with ``id``, ``title``, ``uploader`` and ``uploader_id``.
        The title is mandatory; the uploader fields are looked up with
        ``fatal=False``, so a page without them does not abort extraction.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The page <title> carries a fixed site suffix; strip it if present.
        title = remove_end(self._html_search_regex(
            r'<title>([^<]+)</title>', webpage, 'title'),
            ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')

        # default=None: a missing addVariable() call is not an error, it
        # just means one of the fallbacks below has to be used instead.
        video_url = self._html_search_regex(
            r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
        if video_url:
            info_dict = {
                'formats': [{
                    'url': video_url,
                }],
            }
        else:
            entries = self._parse_html5_media_entries(url, webpage, video_id)
            if entries:
                info_dict = entries[0]
            else:
                # require_title=False: the title is taken from the page
                # <title> above rather than from the player data.
                info_dict = self._extract_jwplayer_data(
                    webpage, video_id, require_title=False)

        # Accept the same scheme / host variants as _VALID_URL so uploader
        # data is not silently lost when profile links use https or omit
        # the "www." prefix.
        user_link = r': <a href="https?://(?:www\.)?thisav\.com/user/[0-9]+/'
        uploader = self._html_search_regex(
            user_link + r'(?:[^"]+)">([^<]+)</a>',
            webpage, 'uploader name', fatal=False)
        uploader_id = self._html_search_regex(
            user_link + r'([^"]+)">(?:[^<]+)</a>',
            webpage, 'uploader id', fatal=False)

        info_dict.update({
            'id': video_id,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'title': title,
        })

        return info_dict