Merge branch 'douyutv' of https://github.com/bonfy/youtube-dl into bonfy-douyutv
[youtube-dl] / youtube_dl / extractor / primesharetv.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..compat import (
7     compat_urllib_parse,
8     compat_urllib_request,
9 )
10 from ..utils import ExtractorError
11
12
class PrimeShareTVIE(InfoExtractor):
    """Extractor for primeshare.tv ``/download/<id>`` pages.

    The download page contains a form of hidden inputs that has to be
    posted back to the same URL; the response to that POST is the page
    that carries the actual media URL and the title.
    """

    _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)'

    _TEST = {
        'url': 'http://primeshare.tv/download/238790B611',
        'md5': 'b92d9bf5461137c36228009f31533fbc',
        'info_dict': {
            'id': '238790B611',
            'ext': 'mp4',
            'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona',
        },
    }

    def _real_extract(self, url):
        """Resolve a primeshare.tv download URL to its media information.

        Args:
            url: A URL matching ``_VALID_URL``.

        Returns:
            A dict with the keys ``id``, ``url``, ``title`` and ``ext``.

        Raises:
            ExtractorError: If the landing page reports that the file
                does not exist.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        if '>File not exist<' in webpage:
            raise ExtractorError('Video %s does not exist' % video_id, expected=True)

        # Collect every hidden <input> as a name -> value mapping; these
        # are the form fields that get posted back below.
        fields = dict(re.findall(r'''(?x)<input\s+
            type="hidden"\s+
            name="([^"]+)"\s+
            (?:id="[^"]+"\s+)?
            value="([^"]*)"
            ''', webpage))

        headers = {
            'Referer': url,
            'Content-Type': 'application/x-www-form-urlencoded',
        }

        # The page announces a countdown in the JS variable cWaitTime;
        # use 7 when it is not found and wait one extra second on top.
        # NOTE(review): presumably the site rejects the form if it is
        # submitted before the countdown ends -- confirm.
        wait_time = int(self._search_regex(
            r'var\s+cWaitTime\s*=\s*(\d+)',
            webpage, 'wait time', default=7)) + 1
        self._sleep(wait_time, video_id)

        # urlencode() yields text on Python 3, while urllib only accepts
        # bytes as a POST body and raises TypeError otherwise.  The
        # encoded form is percent-escaped, so ASCII is always sufficient.
        post_data = compat_urllib_parse.urlencode(fields).encode('ascii')

        req = compat_urllib_request.Request(url, post_data, headers)
        video_page = self._download_webpage(
            req, video_id, 'Downloading video page')

        video_url = self._search_regex(
            r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'",
            video_page, 'video url')

        title = self._html_search_regex(
            r'<h1>Watch\s*(?:&nbsp;)?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?:&nbsp;)?\s*<strong>',
            video_page, 'title')

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': 'mp4',
        }