_ Git - youtube-dl/blob - youtube_dl/extractor/techtalks.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     get_element_by_attribute,
   8     clean_html,
   9 )
  10
  11
  12 class TechTalksIE(InfoExtractor):
  13     _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/'
  14
  15     _TEST = {
  16         'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
  17         'info_dict': {
  18             'id': '57758',
  19             'title': 'Learning Topic Models --- Going beyond SVD',
  20         },
  21         'playlist': [
  22             {
  23                 'info_dict': {
  24                     'id': '57758',
  25                     'ext': 'flv',
  26                     'title': 'Learning Topic Models --- Going beyond SVD',
  27                 },
  28             },
  29             {
  30                 'info_dict': {
  31                     'id': '57758-slides',
  32                     'ext': 'flv',
  33                     'title': 'Learning Topic Models --- Going beyond SVD',
  34                 },
  35             },
  36         ],
  37         'params': {
  38             # rtmp download
  39             'skip_download': True,
  40         },
  41     }
  42
  43     def _real_extract(self, url):
  44         mobj = re.match(self._VALID_URL, url)
  45         talk_id = mobj.group('id')
  46         webpage = self._download_webpage(url, talk_id)
  47         rtmp_url = self._search_regex(
  48             r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url')
  49         play_path = self._search_regex(
  50             r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
  51             webpage, 'presenter play path')
  52         title = clean_html(get_element_by_attribute('class', 'title', webpage))
  53         video_info = {
  54             'id': talk_id,
  55             'title': title,
  56             'url': rtmp_url,
  57             'play_path': play_path,
  58             'ext': 'flv',
  59         }
  60         m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
  61         if m_slides is None:
  62             return video_info
  63         else:
  64             return {
  65                 '_type': 'playlist',
  66                 'id': talk_id,
  67                 'title': title,
  68                 'entries': [
  69                     video_info,
  70                     # The slides video
  71                     {
  72                         'id': talk_id + '-slides',
  73                         'title': title,
  74                         'url': rtmp_url,
  75                         'play_path': m_slides.group(1),
  76                         'ext': 'flv',
  77                     },
  78                 ],
  79             }