compiled_regex_type,
determine_ext,
determine_protocol,
+ dict_get,
error_to_compat_str,
ExtractorError,
extract_attributes,
JSON_LD_RE,
mimetype2ext,
orderedSet,
+ parse_bitrate,
parse_codecs,
parse_duration,
parse_iso8601,
parse_m3u8_attributes,
+ parse_resolution,
RegexNotFoundError,
sanitized_Request,
sanitize_filename,
+ str_or_none,
unescapeHTML,
unified_strdate,
unified_timestamp,
for RTMP - RTMP URL,
for HLS - URL of the M3U8 media playlist,
for HDS - URL of the F4M manifest,
- for DASH - URL of the MPD manifest or
- base URL representing the media
- if MPD manifest is parsed from
- a string,
+ for DASH
+ - HTTP URL to plain file media (in case of
+ unfragmented media)
+ - URL of the MPD manifest or base URL
+ representing the media if MPD manifest
+ is parsed from a string (in case of
+ fragmented media)
for MSS - URL of the ISM manifest.
* manifest_url
The URL of the manifest file in case of
raise ExtractorError('An extractor error has occurred.', cause=e)
def __maybe_fake_ip_and_retry(self, countries):
- if (not self._downloader.params.get('geo_bypass_country', None) and
- self._GEO_BYPASS and
- self._downloader.params.get('geo_bypass', True) and
- not self._x_forwarded_for_ip and
- countries):
+ if (not self._downloader.params.get('geo_bypass_country', None)
+ and self._GEO_BYPASS
+ and self._downloader.params.get('geo_bypass', True)
+ and not self._x_forwarded_for_ip
+ and countries):
country_code = random.choice(countries)
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
if self._x_forwarded_for_ip:
def __check_blocked(self, content):
first_block = content[:512]
- if ('<title>Access to this site is blocked</title>' in content and
- 'Websense' in first_block):
+ if ('<title>Access to this site is blocked</title>' in content
+ and 'Websense' in first_block):
msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
blocked_iframe = self._html_search_regex(
r'<iframe src="([^"]+)"', content,
if block_msg:
msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
raise ExtractorError(msg, expected=True)
- if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
- 'blocklist.rkn.gov.ru' in content):
+ if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
+ and 'blocklist.rkn.gov.ru' in content):
raise ExtractorError(
'Access to this webpage has been blocked by decision of the Russian government. '
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
continue
else:
tbr = float_or_none(
- last_stream_inf.get('AVERAGE-BANDWIDTH') or
- last_stream_inf.get('BANDWIDTH'), scale=1000)
+ last_stream_inf.get('AVERAGE-BANDWIDTH')
+ or last_stream_inf.get('BANDWIDTH'), scale=1000)
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
if res is False:
return []
mpd_doc, urlh = res
+ if mpd_doc is None:
+ return []
mpd_base_url = base_url(urlh.geturl())
return self._parse_mpd_formats(
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
- # NB: mpd_url may be empty when MPD manifest is parsed from a string
- 'url': mpd_url or base_url,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
fragment['duration'] = segment_duration
fragments.append(fragment)
representation_ms_info['fragments'] = fragments
- # NB: MPD manifest may contain direct URLs to unfragmented media.
- # No fragments key is present in this case.
+ # If there is a fragments key available then we correctly recognized fragmented media.
+ # Otherwise we will assume unfragmented media with direct access. Technically, such an
+ # assumption is not necessarily correct since we may simply have no support for
+ # some forms of fragmented media renditions yet, but for now we'll use this fallback.
if 'fragments' in representation_ms_info:
f.update({
+ # NB: mpd_url may be empty when MPD manifest is parsed from a string
+ 'url': mpd_url or base_url,
'fragment_base_url': base_url,
'fragments': [],
'protocol': 'http_dash_segments',
f['url'] = initialization_url
f['fragments'].append({location_key(initialization_url): initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
+ else:
+ # Assuming direct URL to unfragmented media.
+ f['url'] = base_url
+
# According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
# is not necessarily unique within a Period thus formats with
# the same `format_id` are quite possible. There are numerous examples
media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
if media_content:
for source_tag in re.findall(r'<source[^>]+>', media_content):
- source_attributes = extract_attributes(source_tag)
- src = source_attributes.get('src')
+ s_attr = extract_attributes(source_tag)
+ # data-video-src and data-src are non standard but seen
+ # several times in the wild
+ src = dict_get(s_attr, ('src', 'data-video-src', 'data-src'))
if not src:
continue
- f = parse_content_type(source_attributes.get('type'))
+ f = parse_content_type(s_attr.get('type'))
is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url:
- # res attribute is not standard but seen several times
- # in the wild
+ # width, height, res, label and title attributes are
+ # all not standard but seen several times in the wild
+ labels = [
+ s_attr.get(lbl)
+ for lbl in ('label', 'title')
+ if str_or_none(s_attr.get(lbl))
+ ]
+ width = int_or_none(s_attr.get('width'))
+ height = (int_or_none(s_attr.get('height'))
+ or int_or_none(s_attr.get('res')))
+ if not width or not height:
+ for lbl in labels:
+ resolution = parse_resolution(lbl)
+ if not resolution:
+ continue
+ width = width or resolution.get('width')
+ height = height or resolution.get('height')
+ for lbl in labels:
+ tbr = parse_bitrate(lbl)
+ if tbr:
+ break
+ else:
+ tbr = None
f.update({
- 'height': int_or_none(source_attributes.get('res')),
- 'format_id': source_attributes.get('label'),
+ 'width': width,
+ 'height': height,
+ 'tbr': tbr,
+ 'format_id': s_attr.get('label') or s_attr.get('title'),
})
f.update(formats[0])
media_info['formats'].append(f)
return not any_restricted
def extract_subtitles(self, *args, **kwargs):
- if (self._downloader.params.get('writesubtitles', False) or
- self._downloader.params.get('listsubtitles')):
+ if (self._downloader.params.get('writesubtitles', False)
+ or self._downloader.params.get('listsubtitles')):
return self._get_subtitles(*args, **kwargs)
return {}
return ret
def extract_automatic_captions(self, *args, **kwargs):
- if (self._downloader.params.get('writeautomaticsub', False) or
- self._downloader.params.get('listsubtitles')):
+ if (self._downloader.params.get('writeautomaticsub', False)
+ or self._downloader.params.get('listsubtitles')):
return self._get_automatic_captions(*args, **kwargs)
return {}
raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
- if (self._downloader.params.get('mark_watched', False) and
- (self._get_login_info()[0] is not None or
- self._downloader.params.get('cookiefile') is not None)):
+ if (self._downloader.params.get('mark_watched', False)
+ and (self._get_login_info()[0] is not None
+ or self._downloader.params.get('cookiefile') is not None)):
self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):