_ Git - youtube-dl/blob - youtube_dl/extractor/vtv.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5
   6 import re
   7
   8 from ..utils import extract_attributes
   9
  10 class VTVIE(InfoExtractor):
  11     _VALID_URL = r'https?://(au|ca|cz|de|jp|kr|tw|us|vn)\.tvnet\.gov\.vn/[^/]*/(?P<id>[0-9]+)/?'
  12     _TESTS = [{
  13         # Livestream. Channel: VTV 1
  14         'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1',
  15         'info_dict': {
  16             'id': '1011',
  17             'ext': 'mp4',
  18             'title': r're:^VTV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  19             'thumbnail': r're:https?://.*\.png$',
  20         }
  21     }, {
  22         # Downloading a video.
  23         'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h',
  24         'md5': '5263c63d738569ed507980f1e49ebc03',
  25         'info_dict': {
  26             'id': '109788',
  27             'ext': 'mp4',
  28             'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang - TV Net',
  29             'thumbnail': r're:https?://.*\.JPG$',
  30         }
  31     }, {
  32         # Radio live stream. Channel: VOV 1
  33         'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014',
  34         'info_dict': {
  35             'id': '1014',
  36             'ext': 'm4a',
  37             'vcodec': 'none',
  38             'title': r're:VOV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  39             'thumbnail': r're:https?://.*\.png$',
  40         }
  41
  42     }]
  43
  44     def _real_extract(self, url):
  45         video_id = self._match_id(url)
  46         webpage = self._download_webpage(url, video_id)
  47
  48         title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title', default=None, fatal=False)
  49         if title is None:
  50             title = self._og_search_title(webpage)
  51         title.strip()
  52
  53         mediaplayer_div = self._search_regex(r'(<div[^>]*id="mediaplayer"[^>]*>)', webpage, 'mediaplayer element')
  54         mediaplayer_div_attributes = extract_attributes(mediaplayer_div)
  55
  56         thumbnail = mediaplayer_div_attributes.get("data-image")
  57
  58         json_url = mediaplayer_div_attributes["data-file"]
  59         video_streams = self._download_json(json_url, video_id)
  60
  61
  62         # get any working playlist from streams. Currently there's 2 and the first always works,
  63         # but you never know in the future
  64         for stream in video_streams:
  65             formats = self._extract_m3u8_formats(stream.get("url"), video_id, ext="mp4", fatal=False)
  66             if formats:
  67                 break
  68
  69         # better support radio streams
  70         if title.startswith("VOV"):
  71             for f in formats:
  72                 f["ext"] = "m4a"
  73                 f["vcodec"] = "none"
  74
  75         if "/video/" in url or "/radio/" in url:
  76             is_live = False
  77         elif "/kenh-truyen-hinh/" in url:
  78             is_live = True
  79         else:
  80             is_live = None
  81
  82         if is_live:
  83             title = self._live_title(title)
  84
  85         return {
  86             'id': video_id,
  87             'title': title,
  88             'thumbnail': thumbnail,
  89             'formats': formats,
  90             'is_live': is_live,
  91         }