[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / techtalks.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     get_element_by_attribute,
8     clean_html,
9 )
10
11
class TechTalksIE(InfoExtractor):
    """Information extractor for talk pages on techtalks.tv.

    Every talk yields a presenter stream.  When the page additionally links
    a slides stream, both streams are returned together as a playlist whose
    second entry carries the ``-slides`` id suffix.
    """

    # The slug segment between ``/talks/`` and the numeric id is optional.
    _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
        'info_dict': {
            'id': '57758',
            'title': 'Learning Topic Models --- Going beyond SVD',
        },
        'playlist': [
            {
                'info_dict': {
                    'id': '57758',
                    'ext': 'flv',
                    'title': 'Learning Topic Models --- Going beyond SVD',
                },
            },
            {
                'info_dict': {
                    'id': '57758-slides',
                    'ext': 'flv',
                    'title': 'Learning Topic Models --- Going beyond SVD',
                },
            },
        ],
        'params': {
            # The streams are served over rtmp, so the test skips the download.
            'skip_download': True,
        },
    }, {
        'url': 'http://techtalks.tv/talks/57758',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the stream description(s) for the talk at ``url``.

        Returns a single info dict for the presenter stream when the page
        has no slides link; otherwise a ``playlist`` dict whose entries are
        the presenter stream followed by the slides stream.  Both entries
        share the same connection URL and title and differ only in ``id``
        and ``play_path``.
        """
        talk_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, talk_id)

        rtmp_url = self._search_regex(
            r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url')
        presenter_path = self._search_regex(
            r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
            webpage, 'presenter play path')
        title = clean_html(get_element_by_attribute('class', 'title', webpage))

        def build_entry(entry_id, play_path):
            # Presenter and slides entries are identical apart from these
            # two fields.
            return {
                'id': entry_id,
                'title': title,
                'url': rtmp_url,
                'play_path': play_path,
                'ext': 'flv',
            }

        presenter_entry = build_entry(talk_id, presenter_path)

        slides_match = re.search(
            r'<a class="slides" href=\'(.*?)\'', webpage)
        if slides_match is None:
            # No slides on this page: the talk is a single video.
            return presenter_entry

        slides_entry = build_entry(
            talk_id + '-slides', slides_match.group(1))
        return {
            '_type': 'playlist',
            'id': talk_id,
            'title': title,
            'entries': [presenter_entry, slides_entry],
        }