_ Git - youtube-dl/blob - youtube_dl/extractor/tvnet.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_str
   8 from ..utils import (
   9     int_or_none,
  10     unescapeHTML,
  11 )
  12
  13
  14 class TVNetIE(InfoExtractor):
  15     _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?P<id>[0-9]+)'
  16     _TESTS = [{
  17         # video
  18         'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h',
  19         'md5': 'b4d7abe0252c9b47774760b7519c7558',
  20         'info_dict': {
  21             'id': '109788',
  22             'ext': 'mp4',
  23             'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang',
  24             'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
  25             'is_live': False,
  26             'view_count': int,
  27         },
  28     }, {
  29         # audio
  30         'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi',
  31         'md5': 'b5875ce9b0a2eecde029216d0e6db2ae',
  32         'info_dict': {
  33             'id': '27017',
  34             'ext': 'm4a',
  35             'title': 'VOV1 - Bản tin chiều (10/06/2018)',
  36             'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
  37             'is_live': False,
  38         },
  39     }, {
  40         # live stream
  41         'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1',
  42         'info_dict': {
  43             'id': '1011',
  44             'ext': 'mp4',
  45             'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  46             'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
  47             'is_live': True,
  48         },
  49         'params': {
  50             'skip_download': True,
  51         },
  52     }, {
  53         # radio live stream
  54         'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014',
  55         'info_dict': {
  56             'id': '1014',
  57             'ext': 'm4a',
  58             'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  59             'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
  60             'is_live': True,
  61         },
  62         'params': {
  63             'skip_download': True,
  64         },
  65     }]
  66
  67     def _real_extract(self, url):
  68         video_id = self._match_id(url)
  69
  70         webpage = self._download_webpage(url, video_id)
  71
  72         title = self._og_search_title(
  73             webpage, default=None) or self._html_search_meta(
  74             'title', webpage, default=None) or self._search_regex(
  75             r'<title>([^<]+)<', webpage, 'title')
  76         title = re.sub(r'\s*-\s*TV Net\s*$', '', title)
  77
  78         if '/video/' in url or '/radio/' in url:
  79             is_live = False
  80         elif '/kenh-truyen-hinh/' in url:
  81             is_live = True
  82         else:
  83             is_live = None
  84
  85         data_file = unescapeHTML(self._search_regex(
  86             r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
  87             'data file', group='url'))
  88
  89         stream_urls = set()
  90         formats = []
  91         for stream in self._download_json(data_file, video_id):
  92             if not isinstance(stream, dict):
  93                 continue
  94             stream_url = stream.get('url')
  95             if (stream_url in stream_urls or not stream_url or
  96                     not isinstance(stream_url, compat_str)):
  97                 continue
  98             stream_urls.add(stream_url)
  99             formats.extend(self._extract_m3u8_formats(
 100                 stream_url, video_id, 'mp4',
 101                 entry_protocol='m3u8' if is_live else 'm3u8_native',
 102                 m3u8_id='hls', fatal=False))
 103         self._sort_formats(formats)
 104
 105         # better support for radio streams
 106         if title.startswith('VOV'):
 107             for f in formats:
 108                 f.update({
 109                     'ext': 'm4a',
 110                     'vcodec': 'none',
 111                 })
 112
 113         thumbnail = self._og_search_thumbnail(
 114             webpage, default=None) or unescapeHTML(
 115             self._search_regex(
 116                 r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
 117                 'thumbnail', default=None, group='url'))
 118
 119         if is_live:
 120             title = self._live_title(title)
 121
 122         view_count = int_or_none(self._search_regex(
 123             r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>',
 124             webpage, 'view count', default=None))
 125
 126         return {
 127             'id': video_id,
 128             'title': title,
 129             'thumbnail': thumbnail,
 130             'is_live': is_live,
 131             'view_count': view_count,
 132             'formats': formats,
 133         }