2 from __future__ import unicode_literals
17 from ..compat import (
20 compat_etree_fromstring,
26 compat_urllib_parse_unquote,
27 compat_urllib_parse_urlencode,
28 compat_urllib_request,
30 compat_xml_parse_error,
32 from ..downloader.f4m import (
34 remove_encrypted_media,
59 parse_m3u8_attributes,
76 class InfoExtractor(object):
77 """Information Extractor class.
79 Information extractors are the classes that, given a URL, extract
80 information about the video (or videos) the URL refers to. This
81 information includes the real video URL, the video title, author and
82 others. The information is stored in a dictionary which is then
83 passed to the YoutubeDL. The YoutubeDL processes this
84 information possibly downloading the video to the file system, among
85 other possible outcomes.
87 The type field determines the type of the result.
88 By far the most common value (and the default if _type is missing) is
89 "video", which indicates a single video.
91 For a video, the dictionaries must include the following fields:
94 title: Video title, unescaped.
96 Additionally, it must contain either a formats entry or a url one:
98 formats: A list of dictionaries for each format available, ordered
99 from worst to best quality.
102 * url Mandatory. The URL of the video file
104 The URL of the manifest file in case of
105 fragmented media (DASH, hls, hds)
106 * ext Will be calculated from URL if missing
107 * format A human-readable description of the format
108 ("mp4 container with h264/opus").
109 Calculated from the format_id, width, height.
110 and format_note fields if missing.
111 * format_id A short description of the format
112 ("mp4_h264_opus" or "19").
113 Technically optional, but strongly recommended.
114 * format_note Additional info about the format
115 ("3D" or "DASH video")
116 * width Width of the video, if known
117 * height Height of the video, if known
118 * resolution Textual description of width and height
119 * tbr Average bitrate of audio and video in KBit/s
120 * abr Average audio bitrate in KBit/s
121 * acodec Name of the audio codec in use
122 * asr Audio sampling rate in Hertz
123 * vbr Average video bitrate in KBit/s
125 * vcodec Name of the video codec in use
126 * container Name of the container format
127 * filesize The number of bytes, if known in advance
128 * filesize_approx An estimate for the number of bytes
129 * player_url SWF Player URL (used for rtmpdump).
130 * protocol The protocol that will be used for the actual
131 download, lower-case.
132 "http", "https", "rtsp", "rtmp", "rtmpe",
133 "m3u8", "m3u8_native" or "http_dash_segments".
135 Base URL for fragments. Each fragment's path
136 value (if present) will be relative to
138 * fragments A list of fragments of a fragmented media.
139 Each fragment entry must contain either an url
140 or a path. If an url is present it should be
141 considered by a client. Otherwise both path and
142 fragment_base_url must be present. Here is
143 the list of all potential fields:
144 * "url" - fragment's URL
145 * "path" - fragment's path relative to
147 * "duration" (optional, int or float)
148 * "filesize" (optional, int)
149 * preference Order number of this format. If this field is
150 present and not None, the formats get sorted
151 by this field, regardless of all other values.
152 -1 for default (order by other properties),
153 -2 or smaller for less than default.
154 < -1000 to hide the format (if there is
155 another one which is strictly better)
156 * language Language code, e.g. "de" or "en-US".
157 * language_preference Is this in the language mentioned in
159 10 if it's what the URL is about,
160 -1 for default (don't know),
161 -10 otherwise, other values reserved for now.
162 * quality Order number of the video quality of this
163 format, irrespective of the file format.
164 -1 for default (order by other properties),
165 -2 or smaller for less than default.
166 * source_preference Order number for this video source
167 (quality takes higher priority)
168 -1 for default (order by other properties),
169 -2 or smaller for less than default.
170 * http_headers A dictionary of additional HTTP headers
171 to add to the request.
172 * stretched_ratio If given and not 1, indicates that the
173 video's pixels are not square.
174 width : height ratio as float.
175 * no_resume The server does not support resuming the
176 (HTTP or RTMP) download. Boolean.
177 * downloader_options A dictionary of downloader options as
178 described in FileDownloader
180 url: Final video URL.
181 ext: Video filename extension.
182 format: The video format, defaults to ext (used for --get-format)
183 player_url: SWF Player URL (used for rtmpdump).
185 The following fields are optional:
187 alt_title: A secondary title of the video.
188 display_id An alternative identifier for the video, not necessarily
189 unique, but available before title. Typically, id is
190 something like "4234987", title "Dancing naked mole rats",
191 and display_id "dancing-naked-mole-rats"
192 thumbnails: A list of dictionaries, with the following entries:
193 * "id" (optional, string) - Thumbnail format ID
195 * "preference" (optional, int) - quality of the image
196 * "width" (optional, int)
197 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
200 * "filesize" (optional, int)
201 thumbnail: Full URL to a video thumbnail image.
202 description: Full video description.
203 uploader: Full name of the video uploader.
204 license: License name the video is licensed under.
205 creator: The creator of the video.
206 release_date: The date (YYYYMMDD) when the video was released.
207 timestamp: UNIX timestamp of the moment the video became available.
208 upload_date: Video upload date (YYYYMMDD).
209 If not explicitly set, calculated from timestamp.
210 uploader_id: Nickname or id of the video uploader.
211 uploader_url: Full URL to a personal webpage of the video uploader.
212 location: Physical location where the video was filmed.
213 subtitles: The available subtitles as a dictionary in the format
214 {tag: subformats}. "tag" is usually a language code, and
215 "subformats" is a list sorted from lower to higher
216 preference, each element is a dictionary with the "ext"
218 * "data": The subtitles file contents
219 * "url": A URL pointing to the subtitles file
220 "ext" will be calculated from URL if missing
221 automatic_captions: Like 'subtitles', used by the YoutubeIE for
222 automatically generated captions
223 duration: Length of the video in seconds, as an integer or float.
224 view_count: How many users have watched the video on the platform.
225 like_count: Number of positive ratings of the video
226 dislike_count: Number of negative ratings of the video
227 repost_count: Number of reposts of the video
average_rating: Average rating given by users, the scale used depends on the webpage
229 comment_count: Number of comments on the video
230 comments: A list of comments, each with one or more of the following
231 properties (all but one of text or html optional):
232 * "author" - human-readable name of the comment author
233 * "author_id" - user ID of the comment author
235 * "html" - Comment as HTML
236 * "text" - Plain text of the comment
237 * "timestamp" - UNIX timestamp of comment
238 * "parent" - ID of the comment this one is replying to.
239 Set to "root" to indicate that this is a
240 comment to the original video.
241 age_limit: Age restriction for the video, as an integer (years)
242 webpage_url: The URL to the video webpage, if given to youtube-dl it
243 should allow to get the same result again. (It will be set
244 by YoutubeDL if it's missing)
245 categories: A list of categories that the video falls in, for example
247 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
248 is_live: True, False, or None (=unknown). Whether this video is a
249 live stream that goes on instead of a fixed-length video.
250 start_time: Time in seconds where the reproduction should start, as
251 specified in the URL.
252 end_time: Time in seconds where the reproduction should end, as
253 specified in the URL.
254 chapters: A list of dictionaries, with the following entries:
255 * "start_time" - The start time of the chapter in seconds
256 * "end_time" - The end time of the chapter in seconds
257 * "title" (optional, string)
259 The following fields should only be used when the video belongs to some logical
262 chapter: Name or title of the chapter the video belongs to.
263 chapter_number: Number of the chapter the video belongs to, as an integer.
264 chapter_id: Id of the chapter the video belongs to, as a unicode string.
266 The following fields should only be used when the video is an episode of some
267 series, programme or podcast:
269 series: Title of the series or programme the video episode belongs to.
270 season: Title of the season the video episode belongs to.
271 season_number: Number of the season the video episode belongs to, as an integer.
272 season_id: Id of the season the video episode belongs to, as a unicode string.
273 episode: Title of the video episode. Unlike mandatory video title field,
274 this field should denote the exact title of the video episode
275 without any kind of decoration.
276 episode_number: Number of the video episode within a season, as an integer.
277 episode_id: Id of the video episode, as a unicode string.
279 The following fields should only be used when the media is a track or a part of
282 track: Title of the track.
283 track_number: Number of the track within an album or a disc, as an integer.
284 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
286 artist: Artist(s) of the track.
287 genre: Genre(s) of the track.
288 album: Title of the album the track belongs to.
289 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
290 album_artist: List of all artists appeared on the album (e.g.
291 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
293 disc_number: Number of the disc or other physical medium the track belongs to,
295 release_year: Year (YYYY) when the album was released.
297 Unless mentioned otherwise, the fields should be Unicode strings.
299 Unless mentioned otherwise, None is equivalent to absence of information.
302 _type "playlist" indicates multiple videos.
303 There must be a key "entries", which is a list, an iterable, or a PagedList
304 object, each element of which is a valid dictionary by this specification.
306 Additionally, playlists can have "id", "title", "description", "uploader",
307 "uploader_id", "uploader_url" attributes with the same semantics as videos
311 _type "multi_video" indicates that there are multiple videos that
form a single show, for example, multiple acts of an opera or TV episode.
313 It must have an entries key like a playlist and contain all the keys
314 required for a video at the same time.
317 _type "url" indicates that the video must be extracted from another
318 location, possibly by a different extractor. Its only required key is:
319 "url" - the next URL to extract.
320 The key "ie_key" can be set to the class name (minus the trailing "IE",
321 e.g. "Youtube") if the extractor class is known in advance.
322 Additionally, the dictionary may have any properties of the resolved entity
323 known in advance, for example "title" if the title of the referred video is
327 _type "url_transparent" entities have the same specification as "url", but
328 indicate that the given additional information is more precise than the one
329 associated with the resolved URL.
330 This is useful when a site employs a video service that hosts the video and
331 its technical metadata, but that video service does not embed a useful
332 title, description etc.
335 Subclasses of this one should re-define the _real_initialize() and
336 _real_extract() methods and define a _VALID_URL regexp.
337 Probably, they should also be added to the list of extractors.
339 _GEO_BYPASS attribute may be set to False in order to disable
340 geo restriction bypass mechanisms for a particular extractor.
341 Though it won't disable explicit geo restriction bypass based on
342 country code provided with geo_bypass_country. (experimental)
344 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
345 countries for this extractor. One of these countries will be used by
346 geo restriction bypass mechanism right away in order to bypass
347 geo restriction, of course, if the mechanism is not disabled. (experimental)
349 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
350 IP blocks in CIDR notation for this extractor. One of these IP blocks
351 will be used by geo restriction bypass mechanism similarly
352 to _GEO_COUNTRIES. (experimental)
354 NB: both these geo attributes are experimental and may change in future
355 or be completely removed.
357 Finally, the _WORKING attribute should be set to False for broken IEs
358 in order to warn the users and skip the tests.
# Fake source IP sent as X-Forwarded-For; set by the geo bypass machinery.
_x_forwarded_for_ip = None
# Class-level geo bypass hints; subclasses may override (see class docstring).
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): this view appears truncated; upstream also resets a
    # readiness flag here — confirm against the full file.
    self._x_forwarded_for_ip = None
    self.set_downloader(downloader)
def suitable(cls, url):
    """Return True when *url* matches this extractor's _VALID_URL."""
    # The compiled pattern must be cached on *this* class rather than
    # inherited from a superclass, which is why __dict__ is consulted
    # instead of hasattr/getattr.
    compiled = cls.__dict__.get('_VALID_URL_RE')
    if compiled is None:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
def _match_id(cls, url):
    """Return the 'id' group extracted from *url* via _VALID_URL."""
    # Cache the compiled pattern on *this* class (mirrors suitable()).
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    # NOTE(review): an `assert m` guard appears to be missing from this
    # truncated view; a non-matching URL would raise AttributeError here.
    return compat_str(m.group('id'))
396 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Seed the geo bypass machinery with the class-level hints before
    # running subclass-specific initialization.
    self._initialize_geo_bypass({
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    # NOTE(review): the closing `})` and any run-once guard around
    # _real_initialize() are missing from this truncated view.
    self._real_initialize()
def _initialize_geo_bypass(self, geo_bypass_context):
    """
    Initialize geo restriction bypass mechanism.

    This method is used to initialize geo bypass mechanism based on faking
    X-Forwarded-For HTTP header. A random country from provided country list
    is selected and a random IP belonging to this country is generated. This
    IP will be passed as X-Forwarded-For HTTP header in all subsequent
    requests.

    This method will be used for initial geo bypass mechanism initialization
    during the instance initialization with _GEO_COUNTRIES and
    _GEO_IP_BLOCKS.

    You may also manually call it from extractor's code if geo bypass
    information is not available beforehand (e.g. obtained during
    extraction) or due to some other reason. In this case you should pass
    this information in geo bypass context passed as first argument. It may
    contain following fields:

    countries:  List of geo unrestricted countries (similar
                to _GEO_COUNTRIES)
    ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                (similar to _GEO_IP_BLOCKS)
    """
    # Only initialize once: skip when an IP has already been faked.
    if not self._x_forwarded_for_ip:

        # Geo bypass mechanism is explicitly disabled by user
        if not self._downloader.params.get('geo_bypass', True):
        # NOTE(review): the early `return` for the disabled case is not
        # visible in this truncated view.

        if not geo_bypass_context:
            geo_bypass_context = {}

        # Backward compatibility: previously _initialize_geo_bypass
        # expected a list of countries, some 3rd party code may still use
        # it this way.
        if isinstance(geo_bypass_context, (list, tuple)):
            geo_bypass_context = {
                'countries': geo_bypass_context,

        # The whole point of geo bypass mechanism is to fake IP
        # as X-Forwarded-For HTTP header based on some IP block or
        # country code.

        # Path 1: bypassing based on IP block in CIDR notation

        # Explicit IP block specified by user, use it right away
        # regardless of whether extractor is geo bypassable or not
        ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

        # Otherwise use random IP block from geo bypass context but only
        # if extractor is known as geo bypassable
        ip_blocks = geo_bypass_context.get('ip_blocks')
        if self._GEO_BYPASS and ip_blocks:
            ip_block = random.choice(ip_blocks)

        # NOTE(review): the `if ip_block:` guard around the next lines is
        # missing from this truncated view.
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
        if self._downloader.params.get('verbose', False):
            self._downloader.to_screen(
                '[debug] Using fake IP %s as X-Forwarded-For.'
                % self._x_forwarded_for_ip)

        # Path 2: bypassing based on country code

        # Explicit country code specified by user, use it right away
        # regardless of whether extractor is geo bypassable or not
        country = self._downloader.params.get('geo_bypass_country', None)

        # Otherwise use random country code from geo bypass context but
        # only if extractor is known as geo bypassable
        countries = geo_bypass_context.get('countries')
        if self._GEO_BYPASS and countries:
            country = random.choice(countries)

        # NOTE(review): the `if country:` guard around the next lines is
        # missing from this truncated view.
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
        if self._downloader.params.get('verbose', False):
            self._downloader.to_screen(
                '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                % (self._x_forwarded_for_ip, country.upper()))
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # NOTE(review): the enclosing `try:` and the initialization call are
    # missing from this truncated view.
    ie_result = self._real_extract(url)
    if self._x_forwarded_for_ip:
        # Record the faked IP so later media requests reuse it.
        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
    except GeoRestrictedError as e:
        # Geo restriction: optionally fake an IP from one of the allowed
        # countries and retry the extraction.
        if self.__maybe_fake_ip_and_retry(e.countries):
    except ExtractorError:
        # Our own errors pass through unchanged.
    except compat_http_client.IncompleteRead as e:
        # A truncated response is reported as a (expected) network error.
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        # Programming errors inside extractors become extractor errors.
        raise ExtractorError('An extractor error has occurred.', cause=e)
def __maybe_fake_ip_and_retry(self, countries):
    # Fake the X-Forwarded-For IP only when the user did not choose a
    # country explicitly, geo bypass is enabled and no IP is faked yet.
    if (not self._downloader.params.get('geo_bypass_country', None) and
            self._downloader.params.get('geo_bypass', True) and
            not self._x_forwarded_for_ip and
        # NOTE(review): the trailing `countries):` condition line is
        # missing from this truncated view.
        country_code = random.choice(countries)
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
        if self._x_forwarded_for_ip:
            'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
            % (self._x_forwarded_for_ip, country_code.upper()))
    # NOTE(review): the surrounding report_warning( call and the
    # `return True` / `return False` lines are missing from this view.
def set_downloader(self, downloader):
    """Attach *downloader* to this IE for all subsequent operations."""
    self._downloader = downloader
537 def _real_initialize(self):
538 """Real initialization process. Redefine in subclasses."""
541 def _real_extract(self, url):
542 """Real extraction process. Redefine in subclasses."""
547 """A string for getting the InfoExtractor with get_info_extractor"""
548 return compat_str(cls.__name__[:-2])
552 return compat_str(type(self).__name__[:-2])
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """ Returns the response handle """
    # With the default note, print the standard progress message.
    self.report_download_webpage(video_id)
    elif note is not False:
        # NOTE(review): the `if video_id is None:`/`else:` guards around
        # these two to_screen calls are missing from this truncated view.
        self.to_screen('%s' % (note,))
        self.to_screen('%s: %s' % (video_id, note))

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        if 'X-Forwarded-For' not in headers:
            headers['X-Forwarded-For'] = self._x_forwarded_for_ip

    # Request objects are updated in place; plain URLs get the query
    # merged and are wrapped in a sanitized Request below.
    if isinstance(url_or_request, compat_urllib_request.Request):
        url_or_request = update_Request(
            url_or_request, data=data, headers=headers, query=query)
        url_or_request = update_url_query(url_or_request, query)
        if data is not None or headers:
            url_or_request = sanitized_Request(url_or_request, data, headers)
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # NOTE(review): the errnote defaulting and `if fatal:`/`else:`
        # branch lines are partially missing from this truncated view.
        errnote = 'Unable to download webpage'

        errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
        # Fatal errors re-raise with the original traceback attached.
        raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)

        self._downloader.report_warning(errmsg)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
    # NOTE(review): the `if urlh is False:` early return for failed
    # non-fatal requests is missing from this truncated view.
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)
def _guess_encoding_from_content(content_type, webpage_bytes):
    """Best-effort charset detection from the Content-Type header,
    a <meta charset> tag in the first KiB, or a byte order mark."""
    # 1) charset parameter of the Content-Type header.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    encoding = m.group(1)
    # 2) <meta ... charset=...> declaration within the document head.
    m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                  webpage_bytes[:1024])
    encoding = m.group(1).decode('ascii')
    # 3) UTF-16 little-endian byte order mark.
    elif webpage_bytes.startswith(b'\xff\xfe'):
    # NOTE(review): the `if m:` guards, remaining branches and the final
    # `return encoding` are missing from this truncated view.
def __check_blocked(self, content):
    """Raise ExtractorError when *content* looks like a censorship or
    corporate filtering page rather than the requested document."""
    first_block = content[:512]
    # Websense corporate filter page.
    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in first_block):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    # Indian government block page.
    if '<title>The URL you requested has been blocked</title>' in first_block:
        'Access to this webpage has been blocked by Indian censorship. '
        'Use a VPN or proxy server (with --proxy) to route around it.')
        block_msg = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)
    # Russian (Roskomnadzor) block page.
    if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
            'blocklist.rkn.gov.ru' in content):
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
    # NOTE(review): several guard lines (`if blocked_iframe:`, `msg = (`,
    # `if block_msg:`, the closing `expected=True)`) are missing from
    # this truncated view.
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read the body of *urlh*, honour the dump/write-pages debug options,
    decode to text and run the censorship-page check."""
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    # NOTE(review): the `if not encoding:` guard around the next line is
    # missing from this truncated view.
    encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
    if self._downloader.params.get('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        basen = '%s_%s' % (video_id, urlh.geturl())
        # Keep dump filenames short by replacing the tail with an md5 of
        # the full name.
        h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
        basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    content = webpage_bytes.decode(encoding, 'replace')
    # Fallback decode when the guessed codec is unknown to Python.
    content = webpage_bytes.decode('utf-8', 'replace')

    self.__check_blocked(content)
    # NOTE(review): the try/except LookupError around decoding and the
    # final `return content` are missing from this truncated view.
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
    """ Returns the data of the page as a string """
    # Retry loop: re-attempt the download up to `tries` times when the
    # server closes the connection mid-body (IncompleteRead).
    while success is False:
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
        except compat_http_client.IncompleteRead as e:
            if try_count >= tries:
            # Back off before the next attempt.
            self._sleep(timeout, video_id)
    # NOTE(review): loop-variable initialization, the inner `try:` and
    # the final unpacking/return are missing from this truncated view.
def _download_xml_handle(
        self, url_or_request, video_id, note='Downloading XML',
        errnote='Unable to download XML', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={}):
    """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
    res = self._download_webpage_handle(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query)
    # NOTE(review): the `if res is False: return res` short-circuit and
    # the tail of the _parse_xml call are missing from this truncated view.
    xml_string, urlh = res
    return self._parse_xml(
        xml_string, video_id, transform_source=transform_source,
725 def _download_xml(self, url_or_request, video_id,
726 note='Downloading XML', errnote='Unable to download XML',
727 transform_source=None, fatal=True, encoding=None,
728 data=None, headers={}, query={}):
729 """Return the xml as an xml.etree.ElementTree.Element"""
730 res = self._download_xml_handle(
731 url_or_request, video_id, note=note, errnote=errnote,
732 transform_source=transform_source, fatal=fatal, encoding=encoding,
733 data=data, headers=headers, query=query)
734 return res if res is False else res[0]
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
    """Parse *xml_string* into an ElementTree element; on failure raise
    ExtractorError (fatal) or warn via report_warning otherwise."""
    # Optional caller-supplied preprocessing of the raw XML text.
    xml_string = transform_source(xml_string)
    return compat_etree_fromstring(xml_string.encode('utf-8'))
    except compat_xml_parse_error as ve:
        errmsg = '%s: Failed to parse XML ' % video_id
        raise ExtractorError(errmsg, cause=ve)
        self.report_warning(errmsg + str(ve))
    # NOTE(review): the `if transform_source:`, `try:` and
    # `if fatal:`/`else:` guard lines are missing from this truncated view.
def _download_json_handle(
        self, url_or_request, video_id, note='Downloading JSON metadata',
        errnote='Unable to download JSON metadata', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={}):
    """Return a tuple (JSON object, URL handle)"""
    res = self._download_webpage_handle(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query)
    # NOTE(review): the `if res is False: return res` short-circuit and
    # the tail of the _parse_json call are missing from this truncated view.
    json_string, urlh = res
    return self._parse_json(
        json_string, video_id, transform_source=transform_source,
764 self, url_or_request, video_id, note='Downloading JSON metadata',
765 errnote='Unable to download JSON metadata', transform_source=None,
766 fatal=True, encoding=None, data=None, headers={}, query={}):
767 res = self._download_json_handle(
768 url_or_request, video_id, note=note, errnote=errnote,
769 transform_source=transform_source, fatal=fatal, encoding=encoding,
770 data=data, headers=headers, query=query)
771 return res if res is False else res[0]
773 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
775 json_string = transform_source(json_string)
777 return json.loads(json_string)
778 except ValueError as ve:
779 errmsg = '%s: Failed to parse JSON ' % video_id
781 raise ExtractorError(errmsg, cause=ve)
783 self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward *msg* to the downloader's warning channel, prefixed with
    the extractor name and, when given, the video id."""
    prefix = '' if video_id is None else '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, prefix, msg)
    self._downloader.report_warning(warning)
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage download for *video_id* has started."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce that an age confirmation is being attempted."""
    self.to_screen('Confirming age')
def report_login(self):
    """Announce that a login attempt is in progress."""
    self.to_screen('Logging in')
def raise_login_required(msg='This video is only available for registered users'):
    """Abort extraction with a message telling the user how to log in."""
    raise ExtractorError(
        '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
    # NOTE(review): the closing `expected=True)` line appears to be
    # missing from this truncated view.
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
    # GeoRestrictedError carries *countries* so extract() can retry with
    # a faked X-Forwarded-For IP from one of them.
    raise GeoRestrictedError(msg, countries=countries)
820 # Methods for following #608
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a URL that points to a page that should be processed.

    The result dict has _type 'url' with 'url' and 'ie_key' always set;
    'id' and 'title' are only set when provided. (The dict literal and
    the final return were truncated in the source; reconstructed here.)
    """
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    if video_id is not None:
        video_info['id'] = video_id
    if video_title is not None:
        video_info['title'] = video_title
    return video_info
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
    """Build a playlist result from an iterable of matches; *getter*
    maps a match to its URL when given."""
    self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
    # NOTE(review): the comprehension/wrapper producing `urls` around the
    # line above is missing from this truncated view.
    return self.playlist_result(
        urls, playlist_id=playlist_id, playlist_title=playlist_title)
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist.

    The result dict has _type 'playlist' with 'entries' always set;
    id/title/description are only set when truthy. (The dict literal,
    two guards and the final return were truncated in the source;
    reconstructed here.)
    """
    video_info = {'_type': 'playlist',
                  'entries': entries}
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    if playlist_description:
        video_info['description'] = playlist_description
    return video_info
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    # Single pattern (string or precompiled) searched directly.
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
        # NOTE(review): the list-of-patterns branch and its loop header
        # around the next line are missing from this truncated view.
        mobj = re.search(p, string, flags)

    # Highlight the field name in blue on capable terminals.
    if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name

    # return the first matching group
    return next(g for g in mobj.groups() if g is not None)
    return mobj.group(group)
    elif default is not NO_DEFAULT:
    raise RegexNotFoundError('Unable to extract %s' % _name)
    self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
    # NOTE(review): several branch lines (match/group/default/fatal
    # handling) are missing from this truncated view.
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    # NOTE(review): an `if res:`/`else:` guard around the cleanup appears
    # to be missing from this truncated view.
    return clean_html(res).strip()
def _get_netrc_login_info(self, netrc_machine=None):
    """Look up (username, password) for *netrc_machine* via the netrc
    module, honouring the --netrc option."""
    netrc_machine = netrc_machine or self._NETRC_MACHINE

    if self._downloader.params.get('usenetrc', False):
        info = netrc.netrc().authenticators(netrc_machine)
        # NOTE(review): the `try:` and the unpacking of `info` into
        # username/password are missing from this truncated view.
        raise netrc.NetrcParseError(
            'No authenticators for %s' % netrc_machine)
    except (IOError, netrc.NetrcParseError) as err:
        # Best effort: a broken or missing .netrc only warns.
        self._downloader.report_warning(
            'parsing .netrc: %s' % error_to_compat_str(err))

    return username, password
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
    """
    Get the login info as (username, password)
    First look for the manually specified credentials using username_option
    and password_option as keys in params dictionary. If no such credentials
    available look in the netrc file using the netrc_machine or _NETRC_MACHINE
    If there's no info available, return (None, None)
    """
    if self._downloader is None:
        # NOTE(review): the early `return (None, None)` for this branch
        # is missing from this truncated view.

    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get(username_option) is not None:
        username = downloader_params[username_option]
        password = downloader_params[password_option]
        # NOTE(review): the `else:` preceding the netrc fallback is
        # missing from this truncated view.
        username, password = self._get_netrc_login_info(netrc_machine)

    return username, password
941 def _get_tfa_info(self, note='two-factor verification code'):
943 Get the two-factor authentication info
944 TODO - asking the user will be required for sms/phone verify
945 currently just uses the command line option
946 If there's no info available, return None
948 if self._downloader is None:
950 downloader_params = self._downloader.params
952 if downloader_params.get('twofactor') is not None:
953 return downloader_params['twofactor']
955 return compat_getpass('Type %s and press [Return]: ' % note)
957 # Helper functions for extracting OpenGraph info
959 def _og_regexes(prop):
960 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
961 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
962 % {'prop': re.escape(prop)})
963 template = r'<meta[^>]+?%s[^>]+?%s'
965 template % (property_re, content_re),
966 template % (content_re, property_re),
970 def _meta_regex(prop):
971 return r'''(?isx)<meta
972 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
973 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
975 def _og_search_property(self, prop, html, name=None, **kargs):
976 if not isinstance(prop, (list, tuple)):
979 name = 'OpenGraph %s' % prop[0]
982 og_regexes.extend(self._og_regexes(p))
983 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
986 return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Return the og:image URL from *html*; a missing thumbnail is never fatal."""
    thumbnail = self._og_search_property(
        'image', html, 'thumbnail URL', fatal=False, **kargs)
    return thumbnail
def _og_search_description(self, html, **kargs):
    """Return the og:description value from *html*; absence is not fatal."""
    description = self._og_search_property(
        'description', html, fatal=False, **kargs)
    return description
def _og_search_title(self, html, **kargs):
    """Return the og:title value from *html* (fatal unless overridden via kargs)."""
    title = self._og_search_property('title', html, **kargs)
    return title
997 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
998 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1000 regexes = self._og_regexes('video:secure_url') + regexes
1001 return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Return the og:url value from *html* (fatal unless overridden via kargs)."""
    page_url = self._og_search_property('url', html, **kargs)
    return page_url
1006 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1007 if not isinstance(name, (list, tuple)):
1009 if display_name is None:
1010 display_name = name[0]
1011 return self._html_search_regex(
1012 [self._meta_regex(n) for n in name],
1013 html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    """Look up the uploader via the Dublin Core dc.creator <meta> tag."""
    uploader = self._html_search_meta('dc.creator', html, 'uploader')
    return uploader
1018 def _rta_search(self, html):
1019 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1020 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1021 r' content="RTA-5042-1996-1400-1577-RTA"',
1026 def _media_rating_search(self, html):
1027 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1028 rating = self._html_search_meta('rating', html)
1040 return RATING_TABLE.get(rating.lower())
1042 def _family_friendly_search(self, html):
1043 # See http://schema.org/VideoObject
1044 family_friendly = self._html_search_meta(
1045 'isFamilyFriendly', html, default=None)
1047 if not family_friendly:
1056 return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
    """Return the Twitter card player URL from the twitter:player <meta> tag."""
    player_url = self._html_search_meta(
        'twitter:player', html, 'twitter card player')
    return player_url
1062 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1063 json_ld = self._search_regex(
1064 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
1065 html, 'JSON-LD', group='json_ld', **kwargs)
1066 default = kwargs.get('default', NO_DEFAULT)
1068 return default if default is not NO_DEFAULT else {}
1069 # JSON-LD may be malformed and thus `fatal` should be respected.
1070 # At the same time `default` may be passed that assumes `fatal=False`
1071 # for _search_regex. Let's simulate the same behavior here as well.
1072 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1073 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1075 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1076 if isinstance(json_ld, compat_str):
1077 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1081 if not isinstance(json_ld, (list, tuple, dict)):
1083 if isinstance(json_ld, dict):
1086 INTERACTION_TYPE_MAP = {
1087 'CommentAction': 'comment',
1088 'AgreeAction': 'like',
1089 'DisagreeAction': 'dislike',
1090 'LikeAction': 'like',
1091 'DislikeAction': 'dislike',
1092 'ListenAction': 'view',
1093 'WatchAction': 'view',
1094 'ViewAction': 'view',
1097 def extract_interaction_statistic(e):
1098 interaction_statistic = e.get('interactionStatistic')
1099 if not isinstance(interaction_statistic, list):
1101 for is_e in interaction_statistic:
1102 if not isinstance(is_e, dict):
1104 if is_e.get('@type') != 'InteractionCounter':
1106 interaction_type = is_e.get('interactionType')
1107 if not isinstance(interaction_type, compat_str):
1109 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1110 if interaction_count is None:
1112 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1115 count_key = '%s_count' % count_kind
1116 if info.get(count_key) is not None:
1118 info[count_key] = interaction_count
1120 def extract_video_object(e):
1121 assert e['@type'] == 'VideoObject'
1123 'url': e.get('contentUrl'),
1124 'title': unescapeHTML(e.get('name')),
1125 'description': unescapeHTML(e.get('description')),
1126 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1127 'duration': parse_duration(e.get('duration')),
1128 'timestamp': unified_timestamp(e.get('uploadDate')),
1129 'filesize': float_or_none(e.get('contentSize')),
1130 'tbr': int_or_none(e.get('bitrate')),
1131 'width': int_or_none(e.get('width')),
1132 'height': int_or_none(e.get('height')),
1133 'view_count': int_or_none(e.get('interactionCount')),
1135 extract_interaction_statistic(e)
1138 if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1139 item_type = e.get('@type')
1140 if expected_type is not None and expected_type != item_type:
1142 if item_type in ('TVEpisode', 'Episode'):
1144 'episode': unescapeHTML(e.get('name')),
1145 'episode_number': int_or_none(e.get('episodeNumber')),
1146 'description': unescapeHTML(e.get('description')),
1148 part_of_season = e.get('partOfSeason')
1149 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1150 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1151 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1152 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1153 info['series'] = unescapeHTML(part_of_series.get('name'))
1154 elif item_type in ('Article', 'NewsArticle'):
1156 'timestamp': parse_iso8601(e.get('datePublished')),
1157 'title': unescapeHTML(e.get('headline')),
1158 'description': unescapeHTML(e.get('articleBody')),
1160 elif item_type == 'VideoObject':
1161 extract_video_object(e)
1163 video = e.get('video')
1164 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1165 extract_video_object(video)
1167 return dict((k, v) for k, v in info.items() if v is not None)
1170 def _hidden_inputs(html):
1171 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1173 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1174 attrs = extract_attributes(input)
1177 if attrs.get('type') not in ('hidden', 'submit'):
1179 name = attrs.get('name') or attrs.get('id')
1180 value = attrs.get('value')
1181 if name and value is not None:
1182 hidden_inputs[name] = value
1183 return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Return the hidden <input> name/value pairs of the <form> whose id
    attribute equals *form_id* (extraction of the form itself is fatal)."""
    form_re = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(
        form_re, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
1191 def _sort_formats(self, formats, field_preference=None):
1193 raise ExtractorError('No video formats found')
1196 # Automatically determine tbr when missing based on abr and vbr (improves
1197 # formats sorting in some cases)
1198 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1199 f['tbr'] = f['abr'] + f['vbr']
1201 def _formats_key(f):
1202 # TODO remove the following workaround
1203 from ..utils import determine_ext
1204 if not f.get('ext') and 'url' in f:
1205 f['ext'] = determine_ext(f['url'])
1207 if isinstance(field_preference, (list, tuple)):
1210 if f.get(field) is not None
1211 else ('' if field == 'format_id' else -1)
1212 for field in field_preference)
1214 preference = f.get('preference')
1215 if preference is None:
1217 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1220 protocol = f.get('protocol') or determine_protocol(f)
1221 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1223 if f.get('vcodec') == 'none': # audio only
1225 if self._downloader.params.get('prefer_free_formats'):
1226 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1228 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1231 audio_ext_preference = ORDER.index(f['ext'])
1233 audio_ext_preference = -1
1235 if f.get('acodec') == 'none': # video only
1237 if self._downloader.params.get('prefer_free_formats'):
1238 ORDER = ['flv', 'mp4', 'webm']
1240 ORDER = ['webm', 'flv', 'mp4']
1242 ext_preference = ORDER.index(f['ext'])
1245 audio_ext_preference = 0
1249 f.get('language_preference') if f.get('language_preference') is not None else -1,
1250 f.get('quality') if f.get('quality') is not None else -1,
1251 f.get('tbr') if f.get('tbr') is not None else -1,
1252 f.get('filesize') if f.get('filesize') is not None else -1,
1253 f.get('vbr') if f.get('vbr') is not None else -1,
1254 f.get('height') if f.get('height') is not None else -1,
1255 f.get('width') if f.get('width') is not None else -1,
1258 f.get('abr') if f.get('abr') is not None else -1,
1259 audio_ext_preference,
1260 f.get('fps') if f.get('fps') is not None else -1,
1261 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1262 f.get('source_preference') if f.get('source_preference') is not None else -1,
1263 f.get('format_id') if f.get('format_id') is not None else '',
1265 formats.sort(key=_formats_key)
1267 def _check_formats(self, formats, video_id):
1269 formats[:] = filter(
1270 lambda f: self._is_valid_url(
1272 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1276 def _remove_duplicate_formats(formats):
1280 if f['url'] not in format_urls:
1281 format_urls.add(f['url'])
1282 unique_formats.append(f)
1283 formats[:] = unique_formats
1285 def _is_valid_url(self, url, video_id, item='video', headers={}):
1286 url = self._proto_relative_url(url, scheme='http:')
1287 # For now assume non HTTP(S) URLs always valid
1288 if not (url.startswith('http://') or url.startswith('https://')):
1291 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1293 except ExtractorError as e:
1294 if isinstance(e.cause, compat_urllib_error.URLError):
1296 '%s: %s URL is invalid, skipping' % (video_id, item))
1300 def http_scheme(self):
1301 """ Either "http:" or "https:", depending on the user's preferences """
1304 if self._downloader.params.get('prefer_insecure', False)
1307 def _proto_relative_url(self, url, scheme=None):
1310 if url.startswith('//'):
1312 scheme = self.http_scheme()
1317 def _sleep(self, timeout, video_id, msg_template=None):
1318 if msg_template is None:
1319 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1320 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1324 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1325 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1326 fatal=True, m3u8_id=None):
1327 manifest = self._download_xml(
1328 manifest_url, video_id, 'Downloading f4m manifest',
1329 'Unable to download f4m manifest',
1330 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1331 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1332 transform_source=transform_source,
1335 if manifest is False:
1338 return self._parse_f4m_formats(
1339 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1340 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1342 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1343 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1344 fatal=True, m3u8_id=None):
1345 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1346 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1347 if akamai_pv is not None and ';' in akamai_pv.text:
1348 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1349 if playerVerificationChallenge.strip() != '':
1353 manifest_version = '1.0'
1354 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1356 manifest_version = '2.0'
1357 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1358 # Remove unsupported DRM protected media from final formats
1359 # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1360 media_nodes = remove_encrypted_media(media_nodes)
1364 manifest_base_url = get_base_url(manifest)
1366 bootstrap_info = xpath_element(
1367 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1368 'bootstrap info', default=None)
1371 mime_type = xpath_text(
1372 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1373 'base URL', default=None)
1374 if mime_type and mime_type.startswith('audio/'):
1377 for i, media_el in enumerate(media_nodes):
1378 tbr = int_or_none(media_el.attrib.get('bitrate'))
1379 width = int_or_none(media_el.attrib.get('width'))
1380 height = int_or_none(media_el.attrib.get('height'))
1381 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1382 # If <bootstrapInfo> is present, the specified f4m is a
1383 # stream-level manifest, and only set-level manifests may refer to
1384 # external resources. See section 11.4 and section 4 of F4M spec
1385 if bootstrap_info is None:
1387 # @href is introduced in 2.0, see section 11.6 of F4M spec
1388 if manifest_version == '2.0':
1389 media_url = media_el.attrib.get('href')
1390 if media_url is None:
1391 media_url = media_el.attrib.get('url')
1395 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1396 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1397 # If media_url is itself a f4m manifest do the recursive extraction
1398 # since bitrates in parent manifest (this one) and media_url manifest
1399 # may differ leading to inability to resolve the format by requested
1400 # bitrate in f4m downloader
1401 ext = determine_ext(manifest_url)
1403 f4m_formats = self._extract_f4m_formats(
1404 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1405 transform_source=transform_source, fatal=fatal)
1406 # Sometimes stream-level manifest contains single media entry that
1407 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1408 # At the same time parent's media entry in set-level manifest may
1409 # contain it. We will copy it from parent in such cases.
1410 if len(f4m_formats) == 1:
1413 'tbr': f.get('tbr') or tbr,
1414 'width': f.get('width') or width,
1415 'height': f.get('height') or height,
1416 'format_id': f.get('format_id') if not tbr else format_id,
1419 formats.extend(f4m_formats)
1422 formats.extend(self._extract_m3u8_formats(
1423 manifest_url, video_id, 'mp4', preference=preference,
1424 m3u8_id=m3u8_id, fatal=fatal))
1427 'format_id': format_id,
1428 'url': manifest_url,
1429 'manifest_url': manifest_url,
1430 'ext': 'flv' if bootstrap_info is not None else None,
1436 'preference': preference,
1440 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1442 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1446 'preference': preference - 100 if preference else -100,
1447 'resolution': 'multiple',
1448 'format_note': 'Quality selection URL',
1451 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1452 entry_protocol='m3u8', preference=None,
1453 m3u8_id=None, note=None, errnote=None,
1454 fatal=True, live=False):
1455 res = self._download_webpage_handle(
1457 note=note or 'Downloading m3u8 information',
1458 errnote=errnote or 'Failed to download m3u8 information',
1464 m3u8_doc, urlh = res
1465 m3u8_url = urlh.geturl()
1467 return self._parse_m3u8_formats(
1468 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1469 preference=preference, m3u8_id=m3u8_id, live=live)
1471 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1472 entry_protocol='m3u8', preference=None,
1473 m3u8_id=None, live=False):
1474 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1477 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1482 format_url = lambda u: (
1484 if re.match(r'^https?://', u)
1485 else compat_urlparse.urljoin(m3u8_url, u))
1488 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1489 # 2. https://github.com/rg3/youtube-dl/issues/12211
1491 # We should try extracting formats only from master playlists [1, 4.3.4],
1492 # i.e. playlists that describe available qualities. On the other hand
1493 # media playlists [1, 4.3.3] should be returned as is since they contain
1494 # just the media without qualities renditions.
1495 # Fortunately, master playlist can be easily distinguished from media
1496 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1497 # master playlist tags MUST NOT appear in a media playist and vice versa.
1498 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1499 # media playlist and MUST NOT appear in master playlist thus we can
1500 # clearly detect media playlist with this criterion.
1502 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1505 'format_id': m3u8_id,
1507 'protocol': entry_protocol,
1508 'preference': preference,
1512 last_stream_inf = {}
1514 def extract_media(x_media_line):
1515 media = parse_m3u8_attributes(x_media_line)
1516 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1517 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1518 if not (media_type and group_id and name):
1520 groups.setdefault(group_id, []).append(media)
1521 if media_type not in ('VIDEO', 'AUDIO'):
1523 media_url = media.get('URI')
1526 for v in (m3u8_id, group_id, name):
1530 'format_id': '-'.join(format_id),
1531 'url': format_url(media_url),
1532 'manifest_url': m3u8_url,
1533 'language': media.get('LANGUAGE'),
1535 'protocol': entry_protocol,
1536 'preference': preference,
1538 if media_type == 'AUDIO':
1539 f['vcodec'] = 'none'
1542 def build_stream_name():
1543 # Despite specification does not mention NAME attribute for
1544 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1545 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1546 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1547 stream_name = last_stream_inf.get('NAME')
1550 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1551 # from corresponding rendition group
1552 stream_group_id = last_stream_inf.get('VIDEO')
1553 if not stream_group_id:
1555 stream_group = groups.get(stream_group_id)
1556 if not stream_group:
1557 return stream_group_id
1558 rendition = stream_group[0]
1559 return rendition.get('NAME') or stream_group_id
1561 for line in m3u8_doc.splitlines():
1562 if line.startswith('#EXT-X-STREAM-INF:'):
1563 last_stream_inf = parse_m3u8_attributes(line)
1564 elif line.startswith('#EXT-X-MEDIA:'):
1566 elif line.startswith('#') or not line.strip():
1569 tbr = float_or_none(
1570 last_stream_inf.get('AVERAGE-BANDWIDTH') or
1571 last_stream_inf.get('BANDWIDTH'), scale=1000)
1574 format_id.append(m3u8_id)
1575 stream_name = build_stream_name()
1576 # Bandwidth of live streams may differ over time thus making
1577 # format_id unpredictable. So it's better to keep provided
1580 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1581 manifest_url = format_url(line.strip())
1583 'format_id': '-'.join(format_id),
1584 'url': manifest_url,
1585 'manifest_url': m3u8_url,
1588 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1589 'protocol': entry_protocol,
1590 'preference': preference,
1592 resolution = last_stream_inf.get('RESOLUTION')
1594 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1596 f['width'] = int(mobj.group('width'))
1597 f['height'] = int(mobj.group('height'))
1598 # Unified Streaming Platform
1600 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1602 abr, vbr = mobj.groups()
1603 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1608 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1610 audio_group_id = last_stream_inf.get('AUDIO')
1611 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1612 # references a rendition group MUST have a CODECS attribute.
1613 # However, this is not always respected, for example, [2]
1614 # contains EXT-X-STREAM-INF tag which references AUDIO
1615 # rendition group but does not have CODECS and despite
1616 # referencing audio group an audio group, it represents
1617 # a complete (with audio and video) format. So, for such cases
1618 # we will ignore references to rendition groups and treat them
1619 # as complete formats.
1620 if audio_group_id and codecs and f.get('vcodec') != 'none':
1621 audio_group = groups.get(audio_group_id)
1622 if audio_group and audio_group[0].get('URI'):
1623 # TODO: update acodec for audio only formats with
1625 f['acodec'] = 'none'
1627 last_stream_inf = {}
1631 def _xpath_ns(path, namespace=None):
1635 for c in path.split('/'):
1636 if not c or c == '.':
1639 out.append('{%s}%s' % (namespace, c))
1640 return '/'.join(out)
1642 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1643 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1649 namespace = self._parse_smil_namespace(smil)
1651 return self._parse_smil_formats(
1652 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1654 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1655 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1658 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch the SMIL document at *smil_url* and return it parsed as XML."""
    return self._download_xml(
        smil_url, video_id,
        'Downloading SMIL file', 'Unable to download SMIL file',
        fatal=fatal, transform_source=transform_source)
1665 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1666 namespace = self._parse_smil_namespace(smil)
1668 formats = self._parse_smil_formats(
1669 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1670 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1672 video_id = os.path.splitext(url_basename(smil_url))[0]
1676 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1677 name = meta.attrib.get('name')
1678 content = meta.attrib.get('content')
1679 if not name or not content:
1681 if not title and name == 'title':
1683 elif not description and name in ('description', 'abstract'):
1684 description = content
1685 elif not upload_date and name == 'date':
1686 upload_date = unified_strdate(content)
1689 'id': image.get('type'),
1690 'url': image.get('src'),
1691 'width': int_or_none(image.get('width')),
1692 'height': int_or_none(image.get('height')),
1693 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1697 'title': title or video_id,
1698 'description': description,
1699 'upload_date': upload_date,
1700 'thumbnails': thumbnails,
1702 'subtitles': subtitles,
def _parse_smil_namespace(self, smil):
    """Return the XML namespace URI of the root <smil> element, or None."""
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
1709 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1711 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1712 b = meta.get('base') or meta.get('httpBase')
1723 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1724 for medium in media:
1725 src = medium.get('src')
1726 if not src or src in srcs:
1730 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1731 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1732 width = int_or_none(medium.get('width'))
1733 height = int_or_none(medium.get('height'))
1734 proto = medium.get('proto')
1735 ext = medium.get('ext')
1736 src_ext = determine_ext(src)
1737 streamer = medium.get('streamer') or base
1739 if proto == 'rtmp' or streamer.startswith('rtmp'):
1745 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1747 'filesize': filesize,
1751 if transform_rtmp_url:
1752 streamer, src = transform_rtmp_url(streamer, src)
1753 formats[-1].update({
1759 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1760 src_url = src_url.strip()
1762 if proto == 'm3u8' or src_ext == 'm3u8':
1763 m3u8_formats = self._extract_m3u8_formats(
1764 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1765 if len(m3u8_formats) == 1:
1767 m3u8_formats[0].update({
1768 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1773 formats.extend(m3u8_formats)
1776 if src_ext == 'f4m':
1781 'plugin': 'flowplayer-3.2.0.1',
1783 f4m_url += '&' if '?' in f4m_url else '?'
1784 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1785 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1788 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1792 'ext': ext or src_ext or 'flv',
1793 'format_id': 'http-%d' % (bitrate or http_count),
1795 'filesize': filesize,
1803 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1806 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1807 src = textstream.get('src')
1808 if not src or src in urls:
1811 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1812 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1813 subtitles.setdefault(lang, []).append({
1819 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1820 xspf = self._download_xml(
1821 xspf_url, playlist_id, 'Downloading xpsf playlist',
1822 'Unable to download xspf manifest', fatal=fatal)
1825 return self._parse_xspf(
1826 xspf, playlist_id, xspf_url=xspf_url,
1827 xspf_base_url=base_url(xspf_url))
1829 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1831 'xspf': 'http://xspf.org/ns/0/',
1832 's1': 'http://static.streamone.nl/player/ns/0',
1836 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1838 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1839 description = xpath_text(
1840 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1841 thumbnail = xpath_text(
1842 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1843 duration = float_or_none(
1844 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1847 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1848 format_url = urljoin(xspf_base_url, location.text)
1853 'manifest_url': xspf_url,
1854 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1855 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1856 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1858 self._sort_formats(formats)
1863 'description': description,
1864 'thumbnail': thumbnail,
1865 'duration': duration,
1870 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1871 res = self._download_xml_handle(
1873 note=note or 'Downloading MPD manifest',
1874 errnote=errnote or 'Failed to download MPD manifest',
1879 mpd_base_url = base_url(urlh.geturl())
1881 return self._parse_mpd_formats(
1882 mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1883 formats_dict=formats_dict, mpd_url=mpd_url)
1885 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1887 Parse formats from MPD manifest.
1889 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1890 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1891 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1893 if mpd_doc.get('type') == 'dynamic':
1896 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1899 return self._xpath_ns(path, namespace)
1901 def is_drm_protected(element):
1902 return element.find(_add_ns('ContentProtection')) is not None
1904 def extract_multisegment_info(element, ms_parent_info):
1905 ms_info = ms_parent_info.copy()
1907 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1908 # common attributes and elements. We will only extract relevant
1910 def extract_common(source):
1911 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1912 if segment_timeline is not None:
1913 s_e = segment_timeline.findall(_add_ns('S'))
1915 ms_info['total_number'] = 0
1918 r = int(s.get('r', 0))
1919 ms_info['total_number'] += 1 + r
1920 ms_info['s'].append({
1921 't': int(s.get('t', 0)),
1922 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1923 'd': int(s.attrib['d']),
1926 start_number = source.get('startNumber')
1928 ms_info['start_number'] = int(start_number)
1929 timescale = source.get('timescale')
1931 ms_info['timescale'] = int(timescale)
1932 segment_duration = source.get('duration')
1933 if segment_duration:
1934 ms_info['segment_duration'] = float(segment_duration)
1936 def extract_Initialization(source):
1937 initialization = source.find(_add_ns('Initialization'))
1938 if initialization is not None:
1939 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1941 segment_list = element.find(_add_ns('SegmentList'))
1942 if segment_list is not None:
1943 extract_common(segment_list)
1944 extract_Initialization(segment_list)
1945 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1947 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1949 segment_template = element.find(_add_ns('SegmentTemplate'))
1950 if segment_template is not None:
1951 extract_common(segment_template)
1952 media = segment_template.get('media')
1954 ms_info['media'] = media
1955 initialization = segment_template.get('initialization')
1957 ms_info['initialization'] = initialization
1959 extract_Initialization(segment_template)
1962 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1964 for period in mpd_doc.findall(_add_ns('Period')):
1965 period_duration = parse_duration(period.get('duration')) or mpd_duration
1966 period_ms_info = extract_multisegment_info(period, {
1970 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1971 if is_drm_protected(adaptation_set):
1973 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1974 for representation in adaptation_set.findall(_add_ns('Representation')):
1975 if is_drm_protected(representation):
1977 representation_attrib = adaptation_set.attrib.copy()
1978 representation_attrib.update(representation.attrib)
1979 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1980 mime_type = representation_attrib['mimeType']
1981 content_type = mime_type.split('/')[0]
1982 if content_type == 'text':
1983 # TODO implement WebVTT downloading
1985 elif content_type in ('video', 'audio'):
1987 for element in (representation, adaptation_set, period, mpd_doc):
1988 base_url_e = element.find(_add_ns('BaseURL'))
1989 if base_url_e is not None:
1990 base_url = base_url_e.text + base_url
1991 if re.match(r'^https?://', base_url):
1993 if mpd_base_url and not re.match(r'^https?://', base_url):
1994 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1996 base_url = mpd_base_url + base_url
1997 representation_id = representation_attrib.get('id')
1998 lang = representation_attrib.get('lang')
1999 url_el = representation.find(_add_ns('BaseURL'))
2000 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2001 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2003 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2005 'manifest_url': mpd_url,
2006 'ext': mimetype2ext(mime_type),
2007 'width': int_or_none(representation_attrib.get('width')),
2008 'height': int_or_none(representation_attrib.get('height')),
2009 'tbr': float_or_none(bandwidth, 1000),
2010 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2011 'fps': int_or_none(representation_attrib.get('frameRate')),
2012 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2013 'format_note': 'DASH %s' % content_type,
2014 'filesize': filesize,
2015 'container': mimetype2ext(mime_type) + '_dash',
2017 f.update(parse_codecs(representation_attrib.get('codecs')))
2018 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2020 def prepare_template(template_name, identifiers):
2021 t = representation_ms_info[template_name]
2022 t = t.replace('$RepresentationID$', representation_id)
2023 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2024 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2025 t.replace('$$', '$')
2028 # @initialization is a regular template like @media one
2029 # so it should be handled just the same way (see
2030 # https://github.com/rg3/youtube-dl/issues/11605)
2031 if 'initialization' in representation_ms_info:
2032 initialization_template = prepare_template(
2034 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2035 # $Time$ shall not be included for @initialization thus
2036 # only $Bandwidth$ remains
2038 representation_ms_info['initialization_url'] = initialization_template % {
2039 'Bandwidth': bandwidth,
2042 def location_key(location):
2043 return 'url' if re.match(r'^https?://', location) else 'path'
2045 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2047 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2048 media_location_key = location_key(media_template)
2050 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2051 # can't be used at the same time
2052 if '%(Number' in media_template and 's' not in representation_ms_info:
2053 segment_duration = None
2054 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2055 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2056 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2057 representation_ms_info['fragments'] = [{
2058 media_location_key: media_template % {
2059 'Number': segment_number,
2060 'Bandwidth': bandwidth,
2062 'duration': segment_duration,
2063 } for segment_number in range(
2064 representation_ms_info['start_number'],
2065 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2067 # $Number*$ or $Time$ in media template with S list available
2068 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2069 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2070 representation_ms_info['fragments'] = []
2073 segment_number = representation_ms_info['start_number']
2075 def add_segment_url():
2076 segment_url = media_template % {
2077 'Time': segment_time,
2078 'Bandwidth': bandwidth,
2079 'Number': segment_number,
2081 representation_ms_info['fragments'].append({
2082 media_location_key: segment_url,
2083 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2086 for num, s in enumerate(representation_ms_info['s']):
2087 segment_time = s.get('t') or segment_time
2091 for r in range(s.get('r', 0)):
2092 segment_time += segment_d
2095 segment_time += segment_d
2096 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2098 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2099 # or any YouTube dashsegments video
2102 timescale = representation_ms_info['timescale']
2103 for s in representation_ms_info['s']:
2104 duration = float_or_none(s['d'], timescale)
2105 for r in range(s.get('r', 0) + 1):
2106 segment_uri = representation_ms_info['segment_urls'][segment_index]
2108 location_key(segment_uri): segment_uri,
2109 'duration': duration,
2112 representation_ms_info['fragments'] = fragments
2113 elif 'segment_urls' in representation_ms_info:
2114 # Segment URLs with no SegmentTimeline
2115 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2116 # https://github.com/rg3/youtube-dl/pull/14844
2118 segment_duration = float_or_none(
2119 representation_ms_info['segment_duration'],
2120 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2121 for segment_url in representation_ms_info['segment_urls']:
2123 location_key(segment_url): segment_url,
2125 if segment_duration:
2126 fragment['duration'] = segment_duration
2127 fragments.append(fragment)
2128 representation_ms_info['fragments'] = fragments
2129 # NB: MPD manifest may contain direct URLs to unfragmented media.
2130 # No fragments key is present in this case.
2131 if 'fragments' in representation_ms_info:
2133 'fragment_base_url': base_url,
2135 'protocol': 'http_dash_segments',
2137 if 'initialization_url' in representation_ms_info:
2138 initialization_url = representation_ms_info['initialization_url']
2139 if not f.get('url'):
2140 f['url'] = initialization_url
2141 f['fragments'].append({location_key(initialization_url): initialization_url})
2142 f['fragments'].extend(representation_ms_info['fragments'])
2143 # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2144 # is not necessarily unique within a Period thus formats with
2145 # the same `format_id` are quite possible. There are numerous examples
2146 # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2147 # https://github.com/rg3/youtube-dl/issues/13919)
2148 full_info = formats_dict.get(representation_id, {}).copy()
2150 formats.append(full_info)
2152 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        # Download an ISM (Smooth Streaming) manifest and parse it into a
        # list of format dicts via _parse_ism_formats().
        # NOTE(review): lines are elided in this view between the download
        # call and the return (result unpacking / failure handling).
        res = self._download_xml_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
        # urlh.geturl() is used so redirects are resolved before joining
        # relative track URLs.
        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
         https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        # Live and DRM-protected manifests are not supported.
        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
        # @Duration is expressed in TimeScale units (default: 10,000,000 per second).
        duration = int(ism_doc.attrib['Duration'])
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                # AudioTag 255 means AAC; default the FourCC accordingly.
                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))
                # Substitute {bitrate}/{Bitrate} and resolve against the manifest URL.
                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    # An explicit @t resets the running timestamp.
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        # NOTE(review): indexing `stream_fragment` (the current
                        # element) looks wrong here — the next sibling should
                        # presumably come from `stream_fragments`; verify.
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        fragment_ctx['time'] += fragment_ctx['duration']
                    format_id.append(ism_id)
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))
                    'format_id': '-'.join(format_id),
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'fragments': fragments,
                    # Extra parameters consumed by the ISM downloader.
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        # Scan a webpage for HTML5 <video>/<audio> (and AMP) tags and build
        # one entry per media tag with its formats, poster and subtitles.
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Split a MIME type like 'video/mp4; codecs="avc1..."' into
            # ext/codec fields.
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)

        def _media_formats(src, cur_media_type, type_info={}):
            # Resolve a src attribute into (is_plain_url, formats); manifest
            # URLs (HLS/DASH) are expanded into their contained formats.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                        # res attribute is not standard but seen several times
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        f.update(formats[0])
                        media_info['formats'].append(f)
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
            # Only keep tags that actually yielded something playable.
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        # Derive both HDS (f4m) and HLS (m3u8) variants from a single Akamai
        # manifest URL by swapping the /i/ and /z/ path markers.
        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        # Akamai HDS requires an hdcore query parameter to serve fragments.
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        # Propagate the hdcore parameter to every fragment request.
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        # Probe a Wowza streaming engine URL for all protocol variants
        # (HLS, HDS, DASH, SMIL/RTMP, RTSP), minus those in skip_protocols.
        query = compat_urlparse.urlparse(url).query
        # Strip any explicit manifest filename so we can append our own.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

        def manifest_url(manifest):
            # Build the per-protocol manifest URL, preserving the original query.
            m_url = '%s/%s' % (http_base_url, manifest)
                m_url += '?%s' % query

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                # Derive an RTSP variant from each RTMP format.
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    'url': '%s:%s' % (protocol, url_base),
                    'format_id': protocol,
                    'protocol': protocol,
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        # Locate a jwplayer("...").setup({...}) call in the page and parse its
        # options object; returns the parsed dict or None.
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 transform_source=transform_source)
            # Invalid/unparseable options are ignored rather than fatal.
            except ExtractorError:
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data
2447 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2448 jwplayer_data = self._find_jwplayer_data(
2449 webpage, video_id, transform_source=js_to_json)
2450 return self._parse_jwplayer_data(
2451 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Convert a jwplayer setup dict into youtube-dl entries (one per
        # playlist item), including formats and subtitle tracks.
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}
        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]
        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]
            this_video_id = video_id or video_data['mediaid']
            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            # Collect caption/subtitle tracks keyed by their label.
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                    if track_kind.lower() not in ('captions', 'subtitles'):
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            # A single YouTube URL is delegated to the YouTube extractor.
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Convert jwplayer 'sources' entries into format dicts, expanding
        # manifest URLs (HLS/DASH/SMIL) and deduplicating by source URL.
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
            source_url = self._proto_relative_url(source.get('file'))
                source_url = compat_urlparse.urljoin(base_url, source_url)
            # Skip URLs we have already emitted.
            if source_url in urls:
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                    'width': int_or_none(source.get('width')),
                    'tbr': int_or_none(source.get('bitrate')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                            'play_path': prefix + play_path,
                    a_format.update(rtmp_params)
                formats.append(a_format)
2588 def _live_title(self, name):
2589 """ Generate the title for a live video """
2590 now = datetime.datetime.now()
2591 now_str = now.strftime('%Y-%m-%d %H:%M')
2592 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        # Parse v as an int via int_or_none(); on failure either raise
        # ExtractorError (fatal=True) or warn, mentioning the field name.
        res = int_or_none(v, **kwargs)
        # NOTE(review): this bare print() looks like a debugging leftover —
        # it writes to stdout instead of the downloader's logger; verify
        # whether any caller relies on it before removing.
        if 'get_attr' in kwargs:
            print(getattr(v, kwargs['get_attr']))
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Parse v as a float via float_or_none(); on failure either raise
        # ExtractorError (fatal=True) or warn, mentioning the field name.
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
2616 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2617 path='/', secure=False, discard=False, rest={}, **kwargs):
2618 cookie = compat_cookiejar.Cookie(
2619 0, name, value, port, port is not None, domain, True,
2620 domain.startswith('.'), path, True, secure, expire_time,
2621 discard, None, None, rest)
2622 self._downloader.cookiejar.set_cookie(cookie)
2624 def _get_cookies(self, url):
2625 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2626 req = sanitized_Request(url)
2627 self._downloader.cookiejar.add_cookie_header(req)
2628 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        # Yield this extractor's test cases, tagging each with the extractor
        # name (class name without the 'IE' suffix).
        t = getattr(self, '_TEST', None)
            # An extractor must define _TEST or _TESTS, never both.
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            # 'only_matching' tests check URL matching only; skip unless asked.
            if not include_onlymatching and t.get('only_matching', False):
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # For playlist tests, judge by the first contained entry.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        # Suitable only if no test case was age-restricted.
        return not any_restricted
2659 def extract_subtitles(self, *args, **kwargs):
2660 if (self._downloader.params.get('writesubtitles', False) or
2661 self._downloader.params.get('listsubtitles')):
2662 return self._get_subtitles(*args, **kwargs)
    def _get_subtitles(self, *args, **kwargs):
        # Hook for subclasses: return the subtitles dict for a video.
        raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        # Items from list1 win; list2 contributes only previously unseen URLs.
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            # Per-language merge with URL-based deduplication.
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2685 def extract_automatic_captions(self, *args, **kwargs):
2686 if (self._downloader.params.get('writeautomaticsub', False) or
2687 self._downloader.params.get('listsubtitles')):
2688 return self._get_automatic_captions(*args, **kwargs)
    def _get_automatic_captions(self, *args, **kwargs):
        # Hook for subclasses: return the automatic captions dict for a video.
        raise NotImplementedError('This method must be implemented by subclasses')
2694 def mark_watched(self, *args, **kwargs):
2695 if (self._downloader.params.get('mark_watched', False) and
2696 (self._get_login_info()[0] is not None or
2697 self._downloader.params.get('cookiefile') is not None)):
2698 self._mark_watched(*args, **kwargs)
    def _mark_watched(self, *args, **kwargs):
        # Hook for subclasses: perform the site-specific watched-marking call.
        raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        # Build request headers that route geo-verification through the
        # configured proxy, if the user set one.
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
2710 def _generic_id(self, url):
2711 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2713 def _generic_title(self, url):
2714 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    def _make_valid_url(cls):
        # URL pattern: "<key>:<query>", "<key>all:<query>" or "<key>N:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Split the search "URL" into a count prefix and the query text.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # No prefix: return a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        return self._SEARCH_KEY