Merge pull request #5556 from jaimeMF/best-format-nodash
[youtube-dl] / youtube_dl / extractor / veehd.py
1 from __future__ import unicode_literals
2
3 import re
4 import json
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_urlparse,
9 )
10 from ..utils import (
11     ExtractorError,
12     clean_html,
13     get_element_by_id,
14 )
15
16
class VeeHDIE(InfoExtractor):
    """Information extractor for videos hosted on veehd.com.

    Extraction goes from the watch page to the player page referenced by
    the ``#playeriframe`` script snippet, and then looks for the media URL
    in, in order: a ``config=`` JSON blob, a DivX ``<embed>`` tag and
    finally a nested iframe page.
    """

    _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'

    # Seems VeeHD videos have multiple copies on several servers, all of
    # which have different MD5 checksums, so omit md5 field in all tests
    _TESTS = [{
        'url': 'http://veehd.com/video/4639434_Solar-Sinter',
        'info_dict': {
            'id': '4639434',
            'ext': 'mp4',
            'title': 'Solar Sinter',
            'uploader_id': 'VideoEyes',
            'description': 'md5:46a840e8692ddbaffb5f81d9885cb457',
        },
        'skip': 'Video deleted',
    }, {
        'url': 'http://veehd.com/video/4905758_Elysian-Fields-Channeling',
        'info_dict': {
            'id': '4905758',
            'ext': 'mp4',
            'title': 'Elysian Fields - Channeling',
            'description': 'md5:360e4e95fdab58aefbea0f2a19e5604b',
            'uploader_id': 'spotted',
        }
    }, {
        'url': 'http://veehd.com/video/4665804_Tell-No-One-Ne-le-dis-a-personne-2006-French-EngSoftSubs-Re-Up',
        'info_dict': {
            'id': '4665804',
            'ext': 'avi',
            'title': 'Tell No One (Ne le dis a personne) 2006 French(EngSoftSubs) Re-Up',
            'description': 'md5:d660cca685549776f37165e9a10b60ba',
            'uploader_id': 'belial2549',
        }
    }]

    def _fetch_page(self, url, video_id, what):
        """Request ``url`` twice and return the body of the second response.

        VeeHD seems to send garbage on the first request, so the first
        response is discarded.
        See https://github.com/rg3/youtube-dl/issues/2102

        ``what`` is a short label ('webpage', 'player page', ...) used in
        the progress notes of both requests.
        """
        self._download_webpage(url, video_id, 'Requesting %s' % what)
        return self._download_webpage(url, video_id, 'Downloading %s' % what)

    def _extract_video_url(self, player_page, video_id):
        """Return the media URL found on, or reachable from, the player page.

        Three strategies are tried in order; the first one that yields a
        non-empty URL wins. The last strategy is mandatory, so a failure
        there propagates from ``_search_regex``.
        """
        # Strategy 1: JSON configuration embedded as value='config={...}'.
        config_json = self._search_regex(
            r'value=\'config=({.+?})\'', player_page, 'config json', default=None)
        if config_json:
            config = json.loads(config_json)
            # A config without a clip URL must not abort the extraction:
            # fall through so the remaining strategies get a chance.
            clip_url = (config.get('clip') or {}).get('url')
            if clip_url:
                return compat_urlparse.unquote(clip_url)

        # Strategy 2: DivX <embed> tag pointing straight at the file.
        embed_url = self._html_search_regex(
            r'<embed[^>]+type="video/divx"[^>]+src="([^"]+)"',
            player_page, 'video url', default=None)
        if embed_url:
            return embed_url

        # Strategy 3: nested iframe whose page declares file: '...'.
        # The regex drops one optional leading slash so the path can be
        # appended to the site root.
        iframe_src = self._search_regex(
            r'<iframe[^>]+src="/?([^"]+)"', player_page, 'iframe url')
        iframe_url = 'http://veehd.com/%s' % iframe_src
        iframe_page = self._fetch_page(iframe_url, video_id, 'iframe page')
        return self._search_regex(
            r"file\s*:\s*'([^']+)'", iframe_page, 'video url')

    def _real_extract(self, url):
        """Extract the info dict for the VeeHD video at ``url``.

        Raises ExtractorError when the video has been removed (flagged as
        expected) or when the title element is missing from the page.
        Uploader, thumbnail and description are optional: when one of them
        cannot be found its value is None instead of aborting extraction.
        """
        video_id = self._match_id(url)
        webpage = self._fetch_page(url, video_id, 'webpage')

        if 'This video has been removed<' in webpage:
            raise ExtractorError('Video %s has been removed' % video_id, expected=True)

        player_path = self._search_regex(
            r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
            webpage, 'player path')
        player_url = compat_urlparse.urljoin(url, player_path)
        player_page = self._fetch_page(player_url, video_id, 'player page')

        video_url = self._extract_video_url(player_page, video_id)

        title_html = get_element_by_id('videoName', webpage)
        if title_html is None:
            # Fail with a proper extractor error rather than an
            # AttributeError on None.
            raise ExtractorError('Unable to extract title')
        # Everything after the last '|' is dropped; when there is no '|'
        # at all the whole text is the title (rpartition would otherwise
        # leave it empty).
        head, separator, tail = title_html.rpartition('|')
        title = clean_html(head if separator else tail)

        # Optional metadata: do not fail the whole extraction over it.
        uploader_id = self._html_search_regex(
            r'<a href="/profile/\d+">(.+?)</a>',
            webpage, 'uploader', fatal=False)
        thumbnail = self._search_regex(
            r'<img id="veehdpreview" src="(.+?)"',
            webpage, 'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<td class="infodropdown".*?<div>(.*?)<ul',
            webpage, 'description', fatal=False, flags=re.DOTALL)

        return {
            '_type': 'video',
            'id': video_id,
            'title': title,
            'url': video_url,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
            'description': description,
        }