]> git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/closertotruth.py
[pornhub] Extract categories and tags (closes #10499)
[youtube-dl] / youtube_dl / extractor / closertotruth.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7
8
class CloserToTruthIE(InfoExtractor):
    """Extractor for closertotruth.com pages.

    Every result is a ``url_transparent`` entry pointing at a
    ``kaltura:<partner_id>:<entry_id>`` URL handled by the Kaltura
    extractor.  Pages that contain a ``select-version`` drop-down with
    usable options produce a playlist (one entry per distinct option);
    all other pages produce the single video referenced by the
    ``embed-kaltura`` anchor.
    """

    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
        'info_dict': {
            'id': '0_zof1ktre',
            'display_id': 'solutions-the-mind-body-problem',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem?',
            'upload_date': '20140221',
            'timestamp': 1392956007,
            'uploader_id': 'CTTXML'
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/episodes/how-do-brains-work',
        'info_dict': {
            'id': '0_iuxai6g6',
            'display_id': 'how-do-brains-work',
            'ext': 'mov',
            'title': 'How do Brains Work?',
            'upload_date': '20140221',
            'timestamp': 1392956024,
            'uploader_id': 'CTTXML'
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/interviews/1725',
        'info_dict': {
            'id': '1725',
            'title': 'AyaFr-002',
        },
        'playlist_mincount': 2,
    }]

    @staticmethod
    def _kaltura_result(partner_id, entry_id, title):
        """Return a ``url_transparent`` result delegating to the Kaltura extractor."""
        return {
            '_type': 'url_transparent',
            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
            'ie_key': 'Kaltura',
            'title': title,
        }

    @classmethod
    def _version_entries(cls, select_html, partner_id):
        """Build one playlist entry per distinct ``<option>`` in *select_html*.

        *select_html* is the inner HTML of the ``select-version`` element.
        Options are kept in document order; an entry id that was already
        seen is skipped.  An optional ``#...`` suffix on the option value
        is not part of the entry id.
        """
        seen_ids = set()
        entries = []
        for mobj in re.finditer(
                r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
                select_html):
            entry_id = mobj.group('id')
            if entry_id in seen_ids:
                continue
            seen_ids.add(entry_id)
            entries.append(
                cls._kaltura_result(partner_id, entry_id, mobj.group('title')))
        return entries

    def _real_extract(self, url):
        """Return a playlist or a single ``url_transparent`` result for *url*."""
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        # The partner id is taken from the URL of an embedded <script>
        # (".../partner_id/<digits>" or ".../p/<digits>").
        partner_id = self._search_regex(
            r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
            webpage, 'kaltura partner_id')

        # Page title up to the last-resolvable " | " separator.
        title = self._search_regex(
            r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')

        # default=None: the drop-down is optional, so its absence must not
        # abort extraction.
        select = self._search_regex(
            r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
            webpage, 'select version', default=None)
        if select:
            # Only options inside the select-version element are considered;
            # scanning the whole page would also pick up <option> tags that
            # belong to unrelated <select> elements.
            entries = self._version_entries(select, partner_id)
            if entries:
                return self.playlist_result(entries, display_id, title)

        # No usable drop-down: use the single embedded video.
        entry_id = self._search_regex(
            r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
            webpage, 'kaltura entry_id', group='id')

        result = self._kaltura_result(partner_id, entry_id, title)
        result['display_id'] = display_id
        return result