1 from __future__ import unicode_literals
16 from ..compat import (
19 compat_etree_fromstring,
25 compat_urllib_parse_unquote,
26 compat_urllib_parse_urlencode,
27 compat_urllib_request,
30 from ..downloader.f4m import remove_encrypted_media
63 parse_m3u8_attributes,
70 class InfoExtractor(object):
71 """Information Extractor class.
73 Information extractors are the classes that, given a URL, extract
74 information about the video (or videos) the URL refers to. This
75 information includes the real video URL, the video title, author and
76 others. The information is stored in a dictionary which is then
77 passed to the YoutubeDL. The YoutubeDL processes this
78 information possibly downloading the video to the file system, among
79 other possible outcomes.
81 The type field determines the type of the result.
82 By far the most common value (and the default if _type is missing) is
83 "video", which indicates a single video.
85 For a video, the dictionaries must include the following fields:
88 title: Video title, unescaped.
90 Additionally, it must contain either a formats entry or a url one:
92 formats: A list of dictionaries for each format available, ordered
93 from worst to best quality.
96 * url Mandatory. The URL of the video file
98 The URL of the manifest file in case of
99 fragmented media (DASH, hls, hds)
100 * ext Will be calculated from URL if missing
101 * format A human-readable description of the format
102 ("mp4 container with h264/opus").
103 Calculated from the format_id, width, height,
104 and format_note fields if missing.
105 * format_id A short description of the format
106 ("mp4_h264_opus" or "19").
107 Technically optional, but strongly recommended.
108 * format_note Additional info about the format
109 ("3D" or "DASH video")
110 * width Width of the video, if known
111 * height Height of the video, if known
112 * resolution Textual description of width and height
113 * tbr Average bitrate of audio and video in KBit/s
114 * abr Average audio bitrate in KBit/s
115 * acodec Name of the audio codec in use
116 * asr Audio sampling rate in Hertz
117 * vbr Average video bitrate in KBit/s
119 * vcodec Name of the video codec in use
120 * container Name of the container format
121 * filesize The number of bytes, if known in advance
122 * filesize_approx An estimate for the number of bytes
123 * player_url SWF Player URL (used for rtmpdump).
124 * protocol The protocol that will be used for the actual
125 download, lower-case.
126 "http", "https", "rtsp", "rtmp", "rtmpe",
127 "m3u8", "m3u8_native" or "http_dash_segments".
129 Base URL for fragments. Each fragment's path
130 value (if present) will be relative to
132 * fragments A list of fragments of a fragmented media.
133 Each fragment entry must contain either an url
134 or a path. If an url is present it should be
135 considered by a client. Otherwise both path and
136 fragment_base_url must be present. Here is
137 the list of all potential fields:
138 * "url" - fragment's URL
139 * "path" - fragment's path relative to
141 * "duration" (optional, int or float)
142 * "filesize" (optional, int)
143 * preference Order number of this format. If this field is
144 present and not None, the formats get sorted
145 by this field, regardless of all other values.
146 -1 for default (order by other properties),
147 -2 or smaller for less than default.
148 < -1000 to hide the format (if there is
149 another one which is strictly better)
150 * language Language code, e.g. "de" or "en-US".
151 * language_preference Is this in the language mentioned in
153 10 if it's what the URL is about,
154 -1 for default (don't know),
155 -10 otherwise, other values reserved for now.
156 * quality Order number of the video quality of this
157 format, irrespective of the file format.
158 -1 for default (order by other properties),
159 -2 or smaller for less than default.
160 * source_preference Order number for this video source
161 (quality takes higher priority)
162 -1 for default (order by other properties),
163 -2 or smaller for less than default.
164 * http_headers A dictionary of additional HTTP headers
165 to add to the request.
166 * stretched_ratio If given and not 1, indicates that the
167 video's pixels are not square.
168 width : height ratio as float.
169 * no_resume The server does not support resuming the
170 (HTTP or RTMP) download. Boolean.
172 url: Final video URL.
173 ext: Video filename extension.
174 format: The video format, defaults to ext (used for --get-format)
175 player_url: SWF Player URL (used for rtmpdump).
177 The following fields are optional:
179 alt_title: A secondary title of the video.
180 display_id An alternative identifier for the video, not necessarily
181 unique, but available before title. Typically, id is
182 something like "4234987", title "Dancing naked mole rats",
183 and display_id "dancing-naked-mole-rats"
184 thumbnails: A list of dictionaries, with the following entries:
185 * "id" (optional, string) - Thumbnail format ID
187 * "preference" (optional, int) - quality of the image
188 * "width" (optional, int)
189 * "height" (optional, int)
190 * "resolution" (optional, string "{width}x{height}",
192 * "filesize" (optional, int)
193 thumbnail: Full URL to a video thumbnail image.
194 description: Full video description.
195 uploader: Full name of the video uploader.
196 license: License name the video is licensed under.
197 creator: The creator of the video.
198 release_date: The date (YYYYMMDD) when the video was released.
199 timestamp: UNIX timestamp of the moment the video became available.
200 upload_date: Video upload date (YYYYMMDD).
201 If not explicitly set, calculated from timestamp.
202 uploader_id: Nickname or id of the video uploader.
203 uploader_url: Full URL to a personal webpage of the video uploader.
204 location: Physical location where the video was filmed.
205 subtitles: The available subtitles as a dictionary in the format
206 {tag: subformats}. "tag" is usually a language code, and
207 "subformats" is a list sorted from lower to higher
208 preference, each element is a dictionary with the "ext"
210 * "data": The subtitles file contents
211 * "url": A URL pointing to the subtitles file
212 "ext" will be calculated from URL if missing
213 automatic_captions: Like 'subtitles', used by the YoutubeIE for
214 automatically generated captions
215 duration: Length of the video in seconds, as an integer or float.
216 view_count: How many users have watched the video on the platform.
217 like_count: Number of positive ratings of the video
218 dislike_count: Number of negative ratings of the video
219 repost_count: Number of reposts of the video
220 average_rating: Average rating given by users, the scale used depends on the webpage
221 comment_count: Number of comments on the video
222 comments: A list of comments, each with one or more of the following
223 properties (all but one of text or html optional):
224 * "author" - human-readable name of the comment author
225 * "author_id" - user ID of the comment author
227 * "html" - Comment as HTML
228 * "text" - Plain text of the comment
229 * "timestamp" - UNIX timestamp of comment
230 * "parent" - ID of the comment this one is replying to.
231 Set to "root" to indicate that this is a
232 comment to the original video.
233 age_limit: Age restriction for the video, as an integer (years)
234 webpage_url: The URL to the video webpage, if given to youtube-dl it
235 should allow to get the same result again. (It will be set
236 by YoutubeDL if it's missing)
237 categories: A list of categories that the video falls in, for example
239 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
240 is_live: True, False, or None (=unknown). Whether this video is a
241 live stream that goes on instead of a fixed-length video.
242 start_time: Time in seconds where the reproduction should start, as
243 specified in the URL.
244 end_time: Time in seconds where the reproduction should end, as
245 specified in the URL.
247 The following fields should only be used when the video belongs to some logical
250 chapter: Name or title of the chapter the video belongs to.
251 chapter_number: Number of the chapter the video belongs to, as an integer.
252 chapter_id: Id of the chapter the video belongs to, as a unicode string.
254 The following fields should only be used when the video is an episode of some
255 series, programme or podcast:
257 series: Title of the series or programme the video episode belongs to.
258 season: Title of the season the video episode belongs to.
259 season_number: Number of the season the video episode belongs to, as an integer.
260 season_id: Id of the season the video episode belongs to, as a unicode string.
261 episode: Title of the video episode. Unlike mandatory video title field,
262 this field should denote the exact title of the video episode
263 without any kind of decoration.
264 episode_number: Number of the video episode within a season, as an integer.
265 episode_id: Id of the video episode, as a unicode string.
267 The following fields should only be used when the media is a track or a part of
270 track: Title of the track.
271 track_number: Number of the track within an album or a disc, as an integer.
272 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
274 artist: Artist(s) of the track.
275 genre: Genre(s) of the track.
276 album: Title of the album the track belongs to.
277 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
278 album_artist: List of all artists appeared on the album (e.g.
279 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281 disc_number: Number of the disc or other physical medium the track belongs to,
283 release_year: Year (YYYY) when the album was released.
285 Unless mentioned otherwise, the fields should be Unicode strings.
287 Unless mentioned otherwise, None is equivalent to absence of information.
290 _type "playlist" indicates multiple videos.
291 There must be a key "entries", which is a list, an iterable, or a PagedList
292 object, each element of which is a valid dictionary by this specification.
294 Additionally, playlists can have "title", "description" and "id" attributes
295 with the same semantics as videos (see above).
298 _type "multi_video" indicates that there are multiple videos that
299 form a single show, for example multiple acts of an opera or TV episode.
300 It must have an entries key like a playlist and contain all the keys
301 required for a video at the same time.
304 _type "url" indicates that the video must be extracted from another
305 location, possibly by a different extractor. Its only required key is:
306 "url" - the next URL to extract.
307 The key "ie_key" can be set to the class name (minus the trailing "IE",
308 e.g. "Youtube") if the extractor class is known in advance.
309 Additionally, the dictionary may have any properties of the resolved entity
310 known in advance, for example "title" if the title of the referred video is
314 _type "url_transparent" entities have the same specification as "url", but
315 indicate that the given additional information is more precise than the one
316 associated with the resolved URL.
317 This is useful when a site employs a video service that hosts the video and
318 its technical metadata, but that video service does not embed a useful
319 title, description etc.
322 Subclasses of this one should re-define the _real_initialize() and
323 _real_extract() methods and define a _VALID_URL regexp.
324 Probably, they should also be added to the list of extractors.
326 _BYPASS_GEO attribute may be set to False in order to disable
327 geo restriction bypass mechanisms for a particular extractor.
328 Though it won't disable explicit geo restriction bypass based on
329 country code provided with bypass_geo_restriction_as_country.
331 Finally, the _WORKING attribute should be set to False for broken IEs
332 in order to warn the users and skip the tests.
337 _x_forwarded_for_ip = None
341 def __init__(self, downloader=None):
342 """Constructor. Receives an optional downloader."""
344 self._x_forwarded_for_ip = None
345 self.set_downloader(downloader)
348 def suitable(cls, url):
349 """Receives a URL and returns True if suitable for this IE."""
351 # This does not use has/getattr intentionally - we want to know whether
352 # we have cached the regexp for *this* class, whereas getattr would also
353 # match the superclass
354 if '_VALID_URL_RE' not in cls.__dict__:
355 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
356 return cls._VALID_URL_RE.match(url) is not None
359 def _match_id(cls, url):
360 if '_VALID_URL_RE' not in cls.__dict__:
361 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
362 m = cls._VALID_URL_RE.match(url)
368 """Getter method for _WORKING."""
371 def initialize(self):
372 """Initializes an instance (authentication, etc)."""
373 if not self._x_forwarded_for_ip:
374 country_code = self._downloader.params.get('bypass_geo_restriction_as_country', None)
376 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
378 self._real_initialize()
381 def extract(self, url):
382 """Extracts URL information and returns it in list of dicts."""
387 return self._real_extract(url)
388 except GeoRestrictedError as e:
389 if (not self._downloader.params.get('bypass_geo_restriction_as_country', None) and
391 self._downloader.params.get('bypass_geo_restriction', True) and
392 not self._x_forwarded_for_ip and
394 self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries))
395 if self._x_forwarded_for_ip:
397 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
400 except ExtractorError:
402 except compat_http_client.IncompleteRead as e:
403 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
404 except (KeyError, StopIteration) as e:
405 raise ExtractorError('An extractor error has occurred.', cause=e)
407 def set_downloader(self, downloader):
408 """Sets the downloader for this IE."""
409 self._downloader = downloader
411 def _real_initialize(self):
412 """Real initialization process. Redefine in subclasses."""
415 def _real_extract(self, url):
416 """Real extraction process. Redefine in subclasses."""
421 """A string for getting the InfoExtractor with get_info_extractor"""
422 return compat_str(cls.__name__[:-2])
426 return compat_str(type(self).__name__[:-2])
428 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
429 """ Returns the response handle """
431 self.report_download_webpage(video_id)
432 elif note is not False:
434 self.to_screen('%s' % (note,))
436 self.to_screen('%s: %s' % (video_id, note))
437 if isinstance(url_or_request, compat_urllib_request.Request):
438 url_or_request = update_Request(
439 url_or_request, data=data, headers=headers, query=query)
442 url_or_request = update_url_query(url_or_request, query)
443 if data is not None or headers:
444 url_or_request = sanitized_Request(url_or_request, data, headers)
446 return self._downloader.urlopen(url_or_request)
447 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
451 errnote = 'Unable to download webpage'
453 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
455 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
457 self._downloader.report_warning(errmsg)
460 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
461 """ Returns a tuple (page content as string, URL handle) """
462 # Strip hashes from the URL (#1038)
463 if isinstance(url_or_request, (compat_str, str)):
464 url_or_request = url_or_request.partition('#')[0]
466 # Some sites check X-Forwarded-For HTTP header in order to figure out
467 # the origin of the client behind proxy. This allows bypassing geo
468 # restriction by faking this header's value to IP that belongs to some
469 # geo unrestricted country. We will do so once we encounter any
470 # geo restriction error.
471 if self._x_forwarded_for_ip:
472 if 'X-Forwarded-For' not in headers:
473 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
475 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
479 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
480 return (content, urlh)
483 def _guess_encoding_from_content(content_type, webpage_bytes):
484 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
486 encoding = m.group(1)
488 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
489 webpage_bytes[:1024])
491 encoding = m.group(1).decode('ascii')
492 elif webpage_bytes.startswith(b'\xff\xfe'):
499 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
500 content_type = urlh.headers.get('Content-Type', '')
501 webpage_bytes = urlh.read()
502 if prefix is not None:
503 webpage_bytes = prefix + webpage_bytes
505 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
506 if self._downloader.params.get('dump_intermediate_pages', False):
508 url = url_or_request.get_full_url()
509 except AttributeError:
511 self.to_screen('Dumping request to ' + url)
512 dump = base64.b64encode(webpage_bytes).decode('ascii')
513 self._downloader.to_screen(dump)
514 if self._downloader.params.get('write_pages', False):
516 url = url_or_request.get_full_url()
517 except AttributeError:
519 basen = '%s_%s' % (video_id, url)
521 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
522 basen = basen[:240 - len(h)] + h
523 raw_filename = basen + '.dump'
524 filename = sanitize_filename(raw_filename, restricted=True)
525 self.to_screen('Saving request to ' + filename)
526 # Working around MAX_PATH limitation on Windows (see
527 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
528 if compat_os_name == 'nt':
529 absfilepath = os.path.abspath(filename)
530 if len(absfilepath) > 259:
531 filename = '\\\\?\\' + absfilepath
532 with open(filename, 'wb') as outf:
533 outf.write(webpage_bytes)
536 content = webpage_bytes.decode(encoding, 'replace')
538 content = webpage_bytes.decode('utf-8', 'replace')
540 if ('<title>Access to this site is blocked</title>' in content and
541 'Websense' in content[:512]):
542 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
543 blocked_iframe = self._html_search_regex(
544 r'<iframe src="([^"]+)"', content,
545 'Websense information URL', default=None)
547 msg += ' Visit %s for more details' % blocked_iframe
548 raise ExtractorError(msg, expected=True)
549 if '<title>The URL you requested has been blocked</title>' in content[:512]:
551 'Access to this webpage has been blocked by Indian censorship. '
552 'Use a VPN or proxy server (with --proxy) to route around it.')
553 block_msg = self._html_search_regex(
554 r'</h1><p>(.*?)</p>',
555 content, 'block message', default=None)
557 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
558 raise ExtractorError(msg, expected=True)
562 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
563 """ Returns the data of the page as a string """
566 while success is False:
568 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
570 except compat_http_client.IncompleteRead as e:
572 if try_count >= tries:
574 self._sleep(timeout, video_id)
581 def _download_xml(self, url_or_request, video_id,
582 note='Downloading XML', errnote='Unable to download XML',
583 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
584 """Return the xml as an xml.etree.ElementTree.Element"""
585 xml_string = self._download_webpage(
586 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
587 if xml_string is False:
590 xml_string = transform_source(xml_string)
591 return compat_etree_fromstring(xml_string.encode('utf-8'))
593 def _download_json(self, url_or_request, video_id,
594 note='Downloading JSON metadata',
595 errnote='Unable to download JSON metadata',
596 transform_source=None,
597 fatal=True, encoding=None, data=None, headers={}, query={}):
598 json_string = self._download_webpage(
599 url_or_request, video_id, note, errnote, fatal=fatal,
600 encoding=encoding, data=data, headers=headers, query=query)
601 if (not fatal) and json_string is False:
603 return self._parse_json(
604 json_string, video_id, transform_source=transform_source, fatal=fatal)
606 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
608 json_string = transform_source(json_string)
610 return json.loads(json_string)
611 except ValueError as ve:
612 errmsg = '%s: Failed to parse JSON ' % video_id
614 raise ExtractorError(errmsg, cause=ve)
616 self.report_warning(errmsg + str(ve))
618 def report_warning(self, msg, video_id=None):
619 idstr = '' if video_id is None else '%s: ' % video_id
620 self._downloader.report_warning(
621 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
623 def to_screen(self, msg):
624 """Print msg to screen, prefixing it with '[ie_name]'"""
625 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
627 def report_extraction(self, id_or_name):
628 """Report information extraction."""
629 self.to_screen('%s: Extracting information' % id_or_name)
631 def report_download_webpage(self, video_id):
632 """Report webpage download."""
633 self.to_screen('%s: Downloading webpage' % video_id)
635 def report_age_confirmation(self):
636 """Report attempt to confirm age."""
637 self.to_screen('Confirming age')
639 def report_login(self):
640 """Report attempt to log in."""
641 self.to_screen('Logging in')
644 def raise_login_required(msg='This video is only available for registered users'):
645 raise ExtractorError(
646 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        # Signal a geo-block to the caller: extract() catches GeoRestrictedError
        # and may retry with a faked X-Forwarded-For IP picked from `countries`
        # (a list of country codes, or None if unknown).
        raise GeoRestrictedError(msg, countries=countries)
653 # Methods for following #608
655 def url_result(url, ie=None, video_id=None, video_title=None):
656 """Returns a URL that points to a page that should be processed"""
657 # TODO: ie should be the class used for getting the info
658 video_info = {'_type': 'url',
661 if video_id is not None:
662 video_info['id'] = video_id
663 if video_title is not None:
664 video_info['title'] = video_title
668 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
669 """Returns a playlist"""
670 video_info = {'_type': 'playlist',
673 video_info['id'] = playlist_id
675 video_info['title'] = playlist_title
676 if playlist_description:
677 video_info['description'] = playlist_description
680 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
682 Perform a regex search on the given string, using a single or a list of
683 patterns returning the first matching group.
684 In case of failure return a default value or raise a WARNING or a
685 RegexNotFoundError, depending on fatal, specifying the field name.
687 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
688 mobj = re.search(pattern, string, flags)
691 mobj = re.search(p, string, flags)
695 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
696 _name = '\033[0;34m%s\033[0m' % name
702 # return the first matching group
703 return next(g for g in mobj.groups() if g is not None)
705 return mobj.group(group)
706 elif default is not NO_DEFAULT:
709 raise RegexNotFoundError('Unable to extract %s' % _name)
711 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
714 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
716 Like _search_regex, but strips HTML tags and unescapes entities.
718 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
720 return clean_html(res).strip()
724 def _get_netrc_login_info(self, netrc_machine=None):
727 netrc_machine = netrc_machine or self._NETRC_MACHINE
729 if self._downloader.params.get('usenetrc', False):
731 info = netrc.netrc().authenticators(netrc_machine)
736 raise netrc.NetrcParseError(
737 'No authenticators for %s' % netrc_machine)
738 except (IOError, netrc.NetrcParseError) as err:
739 self._downloader.report_warning(
740 'parsing .netrc: %s' % error_to_compat_str(err))
742 return username, password
744 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
746 Get the login info as (username, password)
747 First look for the manually specified credentials using username_option
748 and password_option as keys in params dictionary. If no such credentials
749 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
751 If there's no info available, return (None, None)
753 if self._downloader is None:
756 downloader_params = self._downloader.params
758 # Attempt to use provided username and password or .netrc data
759 if downloader_params.get(username_option) is not None:
760 username = downloader_params[username_option]
761 password = downloader_params[password_option]
763 username, password = self._get_netrc_login_info(netrc_machine)
765 return username, password
767 def _get_tfa_info(self, note='two-factor verification code'):
769 Get the two-factor authentication info
770 TODO - asking the user will be required for sms/phone verify
771 currently just uses the command line option
772 If there's no info available, return None
774 if self._downloader is None:
776 downloader_params = self._downloader.params
778 if downloader_params.get('twofactor') is not None:
779 return downloader_params['twofactor']
781 return compat_getpass('Type %s and press [Return]: ' % note)
783 # Helper functions for extracting OpenGraph info
785 def _og_regexes(prop):
786 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
787 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
788 % {'prop': re.escape(prop)})
789 template = r'<meta[^>]+?%s[^>]+?%s'
791 template % (property_re, content_re),
792 template % (content_re, property_re),
    def _meta_regex(prop):
        # Build a regex matching a <meta> tag whose itemprop/name/property/id/
        # http-equiv attribute equals `prop` (asserted via lookahead, so the
        # attribute order within the tag does not matter), capturing the tag's
        # content attribute in the named group "content". (?x) verbose mode
        # lets the pattern span multiple lines.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
801 def _og_search_property(self, prop, html, name=None, **kargs):
802 if not isinstance(prop, (list, tuple)):
805 name = 'OpenGraph %s' % prop[0]
808 og_regexes.extend(self._og_regexes(p))
809 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
812 return unescapeHTML(escaped)
814 def _og_search_thumbnail(self, html, **kargs):
815 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
817 def _og_search_description(self, html, **kargs):
818 return self._og_search_property('description', html, fatal=False, **kargs)
820 def _og_search_title(self, html, **kargs):
821 return self._og_search_property('title', html, **kargs)
823 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
824 regexes = self._og_regexes('video') + self._og_regexes('video:url')
826 regexes = self._og_regexes('video:secure_url') + regexes
827 return self._html_search_regex(regexes, html, name, **kargs)
829 def _og_search_url(self, html, **kargs):
830 return self._og_search_property('url', html, **kargs)
832 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
833 if not isinstance(name, (list, tuple)):
835 if display_name is None:
836 display_name = name[0]
837 return self._html_search_regex(
838 [self._meta_regex(n) for n in name],
839 html, display_name, fatal=fatal, group='content', **kwargs)
841 def _dc_search_uploader(self, html):
842 return self._html_search_meta('dc.creator', html, 'uploader')
844 def _rta_search(self, html):
845 # See http://www.rtalabel.org/index.php?content=howtofaq#single
846 if re.search(r'(?ix)<meta\s+name="rating"\s+'
847 r' content="RTA-5042-1996-1400-1577-RTA"',
852 def _media_rating_search(self, html):
853 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
854 rating = self._html_search_meta('rating', html)
866 return RATING_TABLE.get(rating.lower())
868 def _family_friendly_search(self, html):
869 # See http://schema.org/VideoObject
870 family_friendly = self._html_search_meta('isFamilyFriendly', html)
872 if not family_friendly:
881 return RATING_TABLE.get(family_friendly.lower())
883 def _twitter_search_player(self, html):
884 return self._html_search_meta('twitter:player', html,
885 'twitter card player')
887 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
888 json_ld = self._search_regex(
889 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
890 html, 'JSON-LD', group='json_ld', **kwargs)
891 default = kwargs.get('default', NO_DEFAULT)
893 return default if default is not NO_DEFAULT else {}
894 # JSON-LD may be malformed and thus `fatal` should be respected.
895 # At the same time `default` may be passed that assumes `fatal=False`
896 # for _search_regex. Let's simulate the same behavior here as well.
897 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
898 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
900 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
901 if isinstance(json_ld, compat_str):
902 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
906 if not isinstance(json_ld, (list, tuple, dict)):
908 if isinstance(json_ld, dict):
911 if e.get('@context') == 'http://schema.org':
912 item_type = e.get('@type')
913 if expected_type is not None and expected_type != item_type:
915 if item_type == 'TVEpisode':
917 'episode': unescapeHTML(e.get('name')),
918 'episode_number': int_or_none(e.get('episodeNumber')),
919 'description': unescapeHTML(e.get('description')),
921 part_of_season = e.get('partOfSeason')
922 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
923 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
924 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
925 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
926 info['series'] = unescapeHTML(part_of_series.get('name'))
927 elif item_type == 'Article':
929 'timestamp': parse_iso8601(e.get('datePublished')),
930 'title': unescapeHTML(e.get('headline')),
931 'description': unescapeHTML(e.get('articleBody')),
933 elif item_type == 'VideoObject':
935 'url': e.get('contentUrl'),
936 'title': unescapeHTML(e.get('name')),
937 'description': unescapeHTML(e.get('description')),
938 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
939 'duration': parse_duration(e.get('duration')),
940 'timestamp': unified_timestamp(e.get('uploadDate')),
941 'filesize': float_or_none(e.get('contentSize')),
942 'tbr': int_or_none(e.get('bitrate')),
943 'width': int_or_none(e.get('width')),
944 'height': int_or_none(e.get('height')),
947 return dict((k, v) for k, v in info.items() if v is not None)
def _hidden_inputs(html):
    """Collect hidden/submit <input> fields from an HTML page.

    Returns a dict mapping each input's name (falling back to its id)
    to its value, skipping inputs without a usable name or value.
    HTML comments are stripped first so commented-out forms are ignored.
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    # `input_el` rather than `input` — avoid shadowing the builtin.
    for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_el)
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Extract hidden <input> fields from the <form> with the given id."""
    # Match from the opening <form id="..."> tag through its closing tag;
    # the quoting character around the id is backreferenced so either
    # quote style is accepted.
    pattern = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(pattern, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
def _sort_formats(self, formats, field_preference=None):
    # Sort `formats` in place from worst to best quality using a composite key.
        raise ExtractorError('No video formats found')

        # Automatically determine tbr when missing based on abr and vbr (improves
        # formats sorting in some cases)
        if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
            f['tbr'] = f['abr'] + f['vbr']

        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        if isinstance(field_preference, (list, tuple)):
            # Caller-supplied sort fields override the default key entirely;
            # missing values sort first ('' for format_id, -1 for numerics).
                if f.get(field) is not None
                else ('' if field == 'format_id' else -1)
                for field in field_preference)

        preference = f.get('preference')
        if preference is None:
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
        protocol = f.get('protocol') or determine_protocol(f)
        # Plain HTTP(S) preferred over RTSP and other protocols.
        proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                audio_ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = -1
            if f.get('acodec') == 'none':  # video only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
                ORDER = ['webm', 'flv', 'mp4']
                ext_preference = ORDER.index(f['ext'])
            audio_ext_preference = 0

            # Composite key: tuples that compare greater sort later (= better).
            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,
            f.get('format_id') if f.get('format_id') is not None else '',
    formats.sort(key=_formats_key)
def _check_formats(self, formats, video_id):
    # Filter out, in place, formats whose URL fails a reachability probe.
    formats[:] = filter(
        lambda f: self._is_valid_url(
            item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1056 def _remove_duplicate_formats(formats):
1060 if f['url'] not in format_urls:
1061 format_urls.add(f['url'])
1062 unique_formats.append(f)
1063 formats[:] = unique_formats
def _is_valid_url(self, url, video_id, item='video', headers={}):
    # Probe `url` with a request to verify it is reachable; used to drop
    # dead format URLs.
    # NOTE(review): `headers={}` is a mutable default argument — it appears
    # to be only read here, but a None default would be safer; verify.
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not (url.startswith('http://') or url.startswith('https://')):
        self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
    except ExtractorError as e:
        # Network-level failures mean the URL is unusable; other extractor
        # errors propagate.
        if isinstance(e.cause, compat_urllib_error.URLError):
                '%s: %s URL is invalid, skipping' % (video_id, item))
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    prefer_insecure = self._downloader.params.get('prefer_insecure', False)
    return 'http:' if prefer_insecure else 'https:'
1087 def _proto_relative_url(self, url, scheme=None):
1090 if url.startswith('//'):
1092 scheme = self.http_scheme()
def _sleep(self, timeout, video_id, msg_template=None):
    # Wait `timeout` seconds, announcing the wait to the user first.
    # msg_template may interpolate %(video_id)s and %(timeout)s.
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True, m3u8_id=None):
    # Download an Adobe HDS (f4m) manifest and parse it into formats.
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,

    # Download failed (and fatal was False) — nothing to parse.
    if manifest is False:

    return self._parse_f4m_formats(
        manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
        transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    # Turn an already-parsed f4m (HDS) manifest into a list of format dicts.
    # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    if akamai_pv is not None and ';' in akamai_pv.text:
        playerVerificationChallenge = akamai_pv.text.split(';')[0]
        if playerVerificationChallenge.strip() != '':

    # f4m 1.0 and 2.0 use different XML namespaces for <media> nodes.
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    base_url = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
        'base URL', default=None)
        base_url = base_url.strip()

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    # NOTE(review): the debug label above says 'base URL' but this reads the
    # mimeType element — looks like a copy/paste slip in the label only.
    if mime_type and mime_type.startswith('audio/'):
    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        # Fall back to the node index when no bitrate is advertised.
        format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
                media_url if media_url.startswith('http://') or media_url.startswith('https://')
                else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(manifest_url)
                f4m_formats = self._extract_f4m_formats(
                    manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                formats.extend(f4m_formats)
                formats.extend(self._extract_m3u8_formats(
                    manifest_url, video_id, 'mp4', preference=preference,
                    m3u8_id=m3u8_id, fatal=fatal))
            'format_id': format_id,
            'url': manifest_url,
            'manifest_url': manifest_url,
            # Bootstrap info implies a downloadable FLV stream.
            'ext': 'flv' if bootstrap_info is not None else None,
            'preference': preference,
1222 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1224 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1228 'preference': preference - 100 if preference else -100,
1229 'resolution': 'multiple',
1230 'format_note': 'Quality selection URL',
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True, live=False):
    # Download an HLS playlist and parse it into a list of format dicts.
    res = self._download_webpage_handle(
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
    m3u8_doc, urlh = res
    # Use the post-redirect URL as base for relative playlist entries.
    m3u8_url = urlh.geturl()

    if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access

    formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

    format_url = lambda u: (
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))

    # We should try extracting formats only from master playlists [1], i.e.
    # playlists that describe available qualities. On the other hand media
    # playlists [2] should be returned as is since they contain just the media
    # without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 2] master
    # playlist tags MUST NOT appear in a media playist and vice versa.
    # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
    # and MUST NOT appear in master playlist thus we can clearly detect media
    # playlist with this criterion.
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
    # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
    # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            'format_id': m3u8_id,
            'protocol': entry_protocol,
            'preference': preference,
    # Tracks whether an audio GROUP-ID is muxed into a video stream.
    audio_in_video_stream = {}
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_m3u8_attributes(line)
        elif line.startswith('#EXT-X-MEDIA:'):
            media = parse_m3u8_attributes(line)
            media_type = media.get('TYPE')
            if media_type in ('VIDEO', 'AUDIO'):
                group_id = media.get('GROUP-ID')
                media_url = media.get('URI')
                    for v in (group_id, media.get('NAME')):
                        'format_id': '-'.join(format_id),
                        'url': format_url(media_url),
                        'language': media.get('LANGUAGE'),
                        'protocol': entry_protocol,
                        'preference': preference,
                    if media_type == 'AUDIO':
                        f['vcodec'] = 'none'
                        if group_id and not audio_in_video_stream.get(group_id):
                            audio_in_video_stream[group_id] = False
                    # When there is no URI in EXT-X-MEDIA let this tag's
                    # data be used by regular URI lines below
                    if media_type == 'AUDIO' and group_id:
                        audio_in_video_stream[group_id] = True
        elif line.startswith('#') or not line.strip():
        else:
            tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                format_id.append(m3u8_id)
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF it still sometimes may be present
            stream_name = last_info.get('NAME') or last_media.get('NAME')
            # Bandwidth of live streams may differ over time thus making
            # format_id unpredictable. So it's better to keep provided
                format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
            manifest_url = format_url(line.strip())
                'format_id': '-'.join(format_id),
                'url': manifest_url,
                'manifest_url': manifest_url,
                'fps': float_or_none(last_info.get('FRAME-RATE')),
                'protocol': entry_protocol,
                'preference': preference,
            resolution = last_info.get('RESOLUTION')
                mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    f['width'] = int(mobj.group('width'))
                    f['height'] = int(mobj.group('height'))
            # Unified Streaming Platform
                r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                abr, vbr = mobj.groups()
                abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
            f.update(parse_codecs(last_info.get('CODECS')))
            # A video stream whose AUDIO group has no standalone rendition
            # actually carries the audio itself.
            if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
                # TODO: update acodec for audio only formats with the same GROUP-ID
                f['acodec'] = 'none'
1367 def _xpath_ns(path, namespace=None):
1371 for c in path.split('/'):
1372 if not c or c == '.':
1375 out.append('{%s}%s' % (namespace, c))
1376 return '/'.join(out)
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    # Download a SMIL manifest and parse it into a list of formats.
    smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

    namespace = self._parse_smil_namespace(smil)

    return self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    # Download a SMIL manifest and parse it into a full info dict.
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch the SMIL document at smil_url and return it as parsed XML."""
    return self._download_xml(
        smil_url, video_id,
        'Downloading SMIL file', 'Unable to download SMIL file',
        fatal=fatal, transform_source=transform_source)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    # Parse a SMIL document into a complete info dict (formats, subtitles,
    # title/description/upload_date from <meta>, thumbnails from <image>).
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    # Fall back to the manifest's basename as the video id.
    video_id = os.path.splitext(url_basename(smil_url))[0]
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not name or not content:
        if not title and name == 'title':
        elif not description and name in ('description', 'abstract'):
            description = content
        elif not upload_date and name == 'date':
            upload_date = unified_strdate(content)

        'id': image.get('type'),
        'url': image.get('src'),
        'width': int_or_none(image.get('width')),
        'height': int_or_none(image.get('height')),
    } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'subtitles': subtitles,
def _parse_smil_namespace(self, smil):
    """Extract the XML namespace URI from the root <smil> tag, if any."""
    namespace_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(namespace_re, smil.tag, 'namespace', default=None)
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    # Build formats from the <video>/<audio> media elements of a SMIL document,
    # dispatching on protocol/extension (RTMP, HLS, HDS, plain HTTP).
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        # Base URL may be advertised under either attribute name.
        b = meta.get('base') or meta.get('httpBase')

    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src)
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
                # Counter disambiguates entries with no advertised bitrate.
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'filesize': filesize,
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
            formats[-1].update({

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # A lone entry means a media playlist without quality metadata;
            # enrich it from the SMIL element itself.
            if len(m3u8_formats) == 1:
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
            formats.extend(m3u8_formats)

        if src_ext == 'f4m':
                'plugin': 'flowplayer-3.2.0.1',
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse_urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))

        if src_url.startswith('http') and self._is_valid_url(src, video_id):
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'filesize': filesize,
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    # Collect subtitle tracks from <textstream> elements of a SMIL document,
    # keyed by language.
    for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
        src = textstream.get('src')
        if not src or src in urls:
        ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
        # The language attribute has several spellings in the wild; fall
        # back to the caller-supplied default.
        lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
        subtitles.setdefault(lang, []).append({
def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
    """Download an XSPF playlist and parse it into a list of entries.

    Returns [] when the download fails and fatal is False.
    """
    xspf = self._download_xml(
        playlist_url, playlist_id, 'Downloading xspf playlist',  # was misspelt 'xpsf'
        'Unable to download xspf manifest', fatal=fatal)
    if xspf is False:
        return []
    return self._parse_xspf(xspf, playlist_id)
def _parse_xspf(self, playlist, playlist_id):
    # Parse an XSPF playlist document into a list of entry dicts.
    # The 's1' namespace carries StreamOne player extensions.
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',

    for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
        description = xpath_text(
            track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
        thumbnail = xpath_text(
            track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
        # XSPF durations are in milliseconds.
        duration = float_or_none(
            xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
            'url': location.text,
            'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
            'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
            'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
        } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
        self._sort_formats(formats)

            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
    # Download an MPD (DASH) manifest and hand it to _parse_mpd_formats.
    # NOTE(review): `formats_dict={}` is a mutable default argument — it
    # appears to be only read, but a None default would be safer; verify.
    res = self._download_webpage_handle(
        note=note or 'Downloading MPD manifest',
        errnote=errnote or 'Failed to download MPD manifest',
    # Base URL (after redirects) is needed to resolve relative segment URLs.
    mpd_base_url = base_url(urlh.geturl())

    return self._parse_mpd_formats(
        compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
        formats_dict=formats_dict, mpd_url=mpd_url)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
    """
    Parse formats from MPD manifest.
    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
    """
    # Live ('dynamic') manifests are not supported.
    if mpd_doc.get('type') == 'dynamic':

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Merge this element's segment info over the parent's (Period ->
        # AdaptationSet -> Representation inherit progressively).
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements. We will only extract relevant
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                    ms_info['total_number'] = 0
                        # @r is the repeat count of an S entry.
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
            start_number = source.get('startNumber')
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = int(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                    ms_info['initialization'] = initialization
                    extract_Initialization(segment_template)

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            # DRM-protected renditions cannot be downloaded.
            if is_drm_protected(adaptation_set):
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                if is_drm_protected(representation):
                # Representation attributes override AdaptationSet ones.
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = mime_type.split('/')[0]
                if content_type == 'text':
                    # TODO implement WebVTT downloading
                elif content_type == 'video' or content_type == 'audio':
                    # Accumulate BaseURL elements from the innermost scope outwards,
                    # stopping once the URL is absolute.
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if base_url_e is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                    if mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                        'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': int_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                    f.update(parse_codecs(representation_attrib.get('codecs')))
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                    def prepare_template(template_name, identifiers):
                        # Convert DASH $...$ placeholders to %-style templates.
                        t = representation_ms_info[template_name]
                        t = t.replace('$RepresentationID$', representation_id)
                        t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                        t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                        # NOTE(review): the result of replace() below is
                        # discarded — '$$' escapes are never unescaped;
                        # presumably this should be `t = t.replace('$$', '$')`.
                        t.replace('$$', '$')

                    # @initialization is a regular template like @media one
                    # so it should be handled just the same way (see
                    # https://github.com/rg3/youtube-dl/issues/11605)
                    if 'initialization' in representation_ms_info:
                        initialization_template = prepare_template(
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                        representation_ms_info['initialization_url'] = initialization_template % {
                            'Bandwidth': bandwidth,

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))

                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                        # can't be used at the same time
                        if '%(Number' in media_template and 's' not in representation_ms_info:
                            segment_duration = None
                            # NOTE(review): `'segment_duration'` here is a bare string
                            # literal (always truthy) — presumably it should read
                            # `'segment_duration' in representation_ms_info`; verify.
                            if 'total_number' not in representation_ms_info and 'segment_duration':
                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                            representation_ms_info['fragments'] = [{
                                'url': media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': bandwidth,
                                'duration': segment_duration,
                            } for segment_number in range(
                                representation_ms_info['start_number'],
                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                            # $Number*$ or $Time$ in media template with S list available
                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                            representation_ms_info['fragments'] = []
                            segment_number = representation_ms_info['start_number']

                            def add_segment_url():
                                segment_url = media_template % {
                                    'Time': segment_time,
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                representation_ms_info['fragments'].append({
                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                            for num, s in enumerate(representation_ms_info['s']):
                                segment_time = s.get('t') or segment_time
                                # @r repeats the same segment duration r more times.
                                for r in range(s.get('r', 0)):
                                    segment_time += segment_d
                                segment_time += segment_d
                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                        # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                        # or any YouTube dashsegments video
                        timescale = representation_ms_info['timescale']
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for r in range(s.get('r', 0) + 1):
                                    'url': representation_ms_info['segment_urls'][segment_index],
                                    'duration': duration,
                        representation_ms_info['fragments'] = fragments
                    # NB: MPD manifest may contain direct URLs to unfragmented media.
                    # No fragments key is present in this case.
                    if 'fragments' in representation_ms_info:
                            'protocol': 'http_dash_segments',
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url']
                            if not f.get('url'):
                                f['url'] = initialization_url
                            f['fragments'].append({'url': initialization_url})
                        f['fragments'].extend(representation_ms_info['fragments'])
                        for fragment in f['fragments']:
                            fragment['url'] = urljoin(base_url, fragment['url'])
                    # Merge with a previously seen format of the same id, or
                    # with caller-provided extra info from formats_dict.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == representation_id)
                    except StopIteration:
                        full_info = formats_dict.get(representation_id, {}).copy()
                        formats.append(full_info)
                        existing_format.update(f)
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
    # Download a Microsoft Smooth Streaming (ISM) manifest and parse it.
    res = self._download_webpage_handle(
        note=note or 'Downloading ISM manifest',
        errnote=errnote or 'Failed to download ISM manifest',
    return self._parse_ism_formats(
        compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
    # Parse a Smooth Streaming manifest into formats; live and
    # DRM-protected streams are not supported.
    if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:

    duration = int(ism_doc.attrib['Duration'])
    # Default Smooth Streaming timescale is 10,000,000 ticks/second.
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio'):
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        for track in stream.findall('QualityLevel'):
            fourcc = track.get('FourCC')
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL'):
                self.report_warning('%s is not a supported codec' % fourcc)
            tbr = int(track.attrib['Bitrate']) // 1000
            width = int_or_none(track.get('MaxWidth'))
            height = int_or_none(track.get('MaxHeight'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            # Both {Bitrate} and {bitrate} placeholders occur in the wild.
            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

            stream_fragments = stream.findall('c')
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                        # NOTE(review): this indexes `stream_fragment` (a single
                        # element's children), not the `stream_fragments` list —
                        # presumably `stream_fragments[...]` was intended; verify.
                        next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                        'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    fragment_ctx['time'] += fragment_ctx['duration']

                format_id.append(ism_id)
                format_id.append(stream_name)
            format_id.append(compat_str(tbr))

                'format_id': '-'.join(format_id),
                'manifest_url': ism_url,
                'ext': 'ismv' if stream_type == 'video' else 'isma',
                'asr': sampling_rate,
                'vcodec': 'none' if stream_type == 'audio' else fourcc,
                'acodec': 'none' if stream_type == 'video' else fourcc,
                'fragments': fragments,
                # Parameters consumed by the ISM fragment downloader.
                '_download_params': {
                    'duration': duration,
                    'timescale': stream_timescale,
                    'width': width or 0,
                    'height': height or 0,
                    'codec_private_data': track.get('CodecPrivateData'),
                    'sampling_rate': sampling_rate,
                    'channels': int_or_none(track.get('Channels', 2)),
                    'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                    'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
    """Extract entries (formats + subtitles) from HTML5 <video>/<audio> tags.

    Fix: restores control flow that was missing in this copy (the branch
    guards inside the helpers, the ``entries``/``media_info`` initialisers,
    the plain-URL fallback and the final return), without which the method
    raised NameError/SyntaxError.
    """
    def absolute_url(video_url):
        # Resolve relative media URLs against the page URL.
        return compat_urlparse.urljoin(base_url, video_url)

    def parse_content_type(content_type):
        # Map a MIME type (+ optional codecs attribute) to ext/codec fields.
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if ctr:
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)
            return f
        return {}

    def _media_formats(src, cur_media_type):
        # Returns (is_plain_url, formats): manifest URLs are expanded into
        # their contained formats, a plain media URL yields a single format.
        full_url = absolute_url(src)
        ext = determine_ext(full_url)
        if ext == 'm3u8':
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
        elif ext == 'mpd':
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id)
        else:
            is_plain_url = True
            formats = [{
                'url': full_url,
                # audio-only tags can not carry a video codec
                'vcodec': 'none' if cur_media_type == 'audio' else None,
            }]
        return is_plain_url, formats

    entries = []
    # Self-closing tags first (they carry no inner <source>/<track> content).
    media_tags = [(media_tag, media_type, '')
                  for media_tag, media_type
                  in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/rg3/youtube-dl/issues/11979, example URL:
        # http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
    for media_tag, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        src = media_attributes.get('src')
        if src:
            _, formats = _media_formats(src, media_type)
            media_info['formats'].extend(formats)
        media_info['thumbnail'] = media_attributes.get('poster')
        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                source_attributes = extract_attributes(source_tag)
                src = source_attributes.get('src')
                if not src:
                    continue
                is_plain_url, formats = _media_formats(src, media_type)
                if is_plain_url:
                    # Merge <source type="..."> metadata into the one format.
                    f = parse_content_type(source_attributes.get('type'))
                    f.update(formats[0])
                    media_info['formats'].append(f)
                else:
                    media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # No kind attribute defaults to a subtitle track per HTML5.
                if not kind or kind in ('subtitles', 'captions'):
                    src = track_attributes.get('src')
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries
def _extract_akamai_formats(self, manifest_url, video_id, hosts=None):
    """Extract HDS and HLS formats from an Akamai manifest URL, optionally
    rewriting the host per protocol via hosts={'hds': ..., 'hls': ...}.

    Fixes: initialises the previously-undefined ``formats`` accumulator,
    guards the host rewrites (they dereferenced a possibly-None host) and
    returns the result; also avoids the shared mutable default argument.
    """
    hosts = hosts or {}
    formats = []
    hdcore_sign = 'hdcore=3.7.0'
    # Swap the HLS ('/i/...master.m3u8') path layout for the HDS one.
    # NOTE(review): the char class `[^/+]` looks like a typo for `[^/]+` —
    # confirm against real Akamai URLs before changing the pattern.
    f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
    hds_host = hosts.get('hds')
    if hds_host:
        f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
    if 'hdcore=' not in f4m_url:
        f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
    f4m_formats = self._extract_f4m_formats(
        f4m_url, video_id, f4m_id='hds', fatal=False)
    for entry in f4m_formats:
        # Segment URLs must carry the same hdcore signature as the manifest.
        entry.update({'extra_param_to_segment_url': hdcore_sign})
    formats.extend(f4m_formats)
    # And the reverse mapping for HLS.
    m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
    hls_host = hosts.get('hls')
    if hls_host:
        m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
    formats.extend(self._extract_m3u8_formats(
        m3u8_url, video_id, 'mp4', 'm3u8_native',
        m3u8_id='hls', fatal=False))
    return formats
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=None):
    """Probe the standard Wowza manifest endpoints (HLS/HDS/DASH/SMIL and
    plain RTMP/RTSP) derived from *url* and return the formats found.

    skip_protocols: iterable of protocol ids ('m3u8', 'f4m', 'dash',
    'smil', 'rtmp', 'rtsp') that should not be probed.

    Fixes: initialises the previously-undefined ``formats`` accumulator,
    closes the truncated ``rtsp_format.update({...})`` literal, restores the
    ``formats.append({...})`` wrapper around the trailing RTMP/RTSP dict and
    returns the result; also avoids the shared mutable default argument.
    """
    skip_protocols = skip_protocols or []
    # Strip a trailing manifest filename so we can append our own.
    url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
    url_base = self._search_regex(
        r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
    http_base_url = 'http' + url_base
    formats = []
    if 'm3u8' not in skip_protocols:
        formats.extend(self._extract_m3u8_formats(
            http_base_url + '/playlist.m3u8', video_id, 'mp4',
            m3u8_entry_protocol, m3u8_id='hls', fatal=False))
    if 'f4m' not in skip_protocols:
        formats.extend(self._extract_f4m_formats(
            http_base_url + '/manifest.f4m',
            video_id, f4m_id='hds', fatal=False))
    if 'dash' not in skip_protocols:
        formats.extend(self._extract_mpd_formats(
            http_base_url + '/manifest.mpd',
            video_id, mpd_id='dash', fatal=False))
    if re.search(r'(?:/smil:|\.smil)', url_base):
        if 'smil' not in skip_protocols:
            rtmp_formats = self._extract_smil_formats(
                http_base_url + '/jwplayer.smil',
                video_id, fatal=False)
            for rtmp_format in rtmp_formats:
                # Derive an RTSP variant from each RTMP format.
                rtsp_format = rtmp_format.copy()
                rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                del rtsp_format['play_path']
                del rtsp_format['ext']
                rtsp_format.update({
                    'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                    'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    'protocol': 'rtsp',
                })
                formats.extend([rtmp_format, rtsp_format])
    else:
        for protocol in ('rtmp', 'rtsp'):
            if protocol not in skip_protocols:
                formats.append({
                    'url': protocol + url_base,
                    'format_id': protocol,
                    'protocol': protocol,
                })
    return formats
2114 def _find_jwplayer_data(webpage):
2116 r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2119 return mobj.group('options')
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    # Find the raw jwplayer(...).setup(...) options blob in the page,
    # parse it as relaxed JSON and hand it on to _parse_jwplayer_data().
    raw_options = self._find_jwplayer_data(webpage)
    jwplayer_data = self._parse_json(
        raw_options, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(
        jwplayer_data, video_id, *args, **kwargs)
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    # Convert a JWPlayer setup/config dict into info dict(s).
    # NOTE(review): this copy appears to be missing several structural
    # lines (the ``entries``/``formats``/``subtitles`` initialisers, some
    # branch guards, the ``a_format = {``/``entries.append({`` openers and
    # the single-entry return) -- the gaps are flagged inline below.
    # Compare against the canonical implementation before relying on it.

    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    if not isinstance(jwplayer_data['playlist'], list):
        jwplayer_data['playlist'] = [jwplayer_data['playlist']]

    for video_data in jwplayer_data['playlist']:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        # Caller-supplied id wins; otherwise use the playlist item's own id.
        this_video_id = video_id or video_data['mediaid']

        for source in video_data['sources']:
            # Normalise protocol-relative '//host/...' URLs.
            source_url = self._proto_relative_url(source['file'])
            # NOTE(review): presumably guarded by ``if base_url:``
            # originally -- as written this urljoins against None.
            source_url = compat_urlparse.urljoin(base_url, source_url)
            source_type = source.get('type') or ''
            # Prefer the declared MIME type, fall back to the URL suffix.
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
                # NOTE(review): an ``elif ext == 'mpd':`` guard seems to be
                # missing before this DASH expansion.
                formats.extend(self._extract_mpd_formats(
                    source_url, this_video_id, mpd_id=mpd_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                # NOTE(review): the audio-only ``formats.append`` and the
                # ``else:`` that introduced the generic branch appear to be
                # missing; the height logic below belongs to that branch.
                height = int_or_none(source.get('height'))
                # Often no height is provided but there is a label in
                # format like 1080p.
                height = int_or_none(self._search_regex(
                    r'^(\d{3,})[pP]$', source.get('label') or '',
                    'height', default=None))
                # NOTE(review): ``a_format = {`` opener (with its
                # 'url'/'height'/'ext' entries) appears to be missing here.
                'width': int_or_none(source.get('width')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        # NOTE(review): ``a_format.update({'url': rtmp_url,``
                        # appears to be missing around this entry.
                        'play_path': prefix + play_path,
                # NOTE(review): presumably guarded by ``if rtmp_params:``.
                a_format.update(rtmp_params)
                formats.append(a_format)
        self._sort_formats(formats)

        tracks = video_data.get('tracks')
        if tracks and isinstance(tracks, list):
            for track in tracks:
                # Only caption tracks become subtitles.
                # NOTE(review): a ``continue`` (and a ``track_url`` presence
                # check) appear to be missing around here.
                if track.get('kind') != 'captions':
                    track_url = urljoin(base_url, track.get('file'))
                # Tracks without a label default to 'en'.
                subtitles.setdefault(track.get('label') or 'en', []).append({
                    'url': self._proto_relative_url(track_url)
                # NOTE(review): the ``entries.append({`` opener (and the
                # closing braces plus a 'formats' entry) appear missing.
                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
    # NOTE(review): ``return entries[0]`` appears to be missing from the
    # single-entry branch.
    if len(entries) == 1:
    return self.playlist_result(entries)
2230 def _live_title(self, name):
2231 """ Generate the title for a live video """
2232 now = datetime.datetime.now()
2233 now_str = now.strftime('%Y-%m-%d %H:%M')
2234 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """int_or_none() wrapper that reports failures to parse *v* as *name*.

    Raises ExtractorError when fatal, otherwise emits a warning and
    returns None. Extra kwargs are forwarded to int_or_none().

    Fixes: removes a stray debugging ``print`` of the 'get_attr' kwarg and
    restores the missing failure handling and return statement.
    """
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
def _float(self, v, name, fatal=False, **kwargs):
    """float_or_none() wrapper that reports failures to parse *v* as *name*.

    Raises ExtractorError when fatal, otherwise emits a warning and
    returns None. Extra kwargs are forwarded to float_or_none().

    Fixes: restores the missing ``if res is None:`` failure handling and
    the return statement that were lost in this copy.
    """
    res = float_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
def _set_cookie(self, domain, name, value, expire_time=None):
    # Register a domain-wide (path '/') version-0 cookie with the
    # downloader's cookie jar; expire_time of None means a session cookie.
    jar = self._downloader.cookiejar
    jar.set_cookie(compat_cookiejar.Cookie(
        0, name, value, None, None, domain, None,
        None, '/', True, False, expire_time, '', None, None, None))
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    # Let the cookie jar fill in the Cookie header for a dummy request,
    # then parse that header back into a SimpleCookie.
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
def get_testcases(self, include_onlymatching=False):
    """Yield this extractor's test cases (_TEST or _TESTS), tagging each
    with the extractor name; 'only_matching' cases are skipped unless
    include_onlymatching is True.

    Fixes: restores the branch and loop structure (``if t:``/``else:``,
    the ``for``/``continue``/``yield`` skeleton) that was lost in this
    copy, leaving the method broken.
    """
    t = getattr(self, '_TEST', None)
    if t:
        # An extractor must define _TEST or _TESTS, never both.
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = [t]
    else:
        tests = getattr(self, '_TESTS', [])
    for t in tests:
        if not include_onlymatching and t.get('only_matching', False):
            continue
        # Strip the conventional 'IE' suffix from the class name.
        t['name'] = type(self).__name__[:-len('IE')]
        yield t
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """
    # Fix: this copy contained a dangling ``if not is_restricted:`` with no
    # body (its ``continue`` was lost), which is a syntax error; the
    # unconditional accumulation below is equivalent without that guard.
    any_restricted = False
    for tc in self.get_testcases(include_onlymatching=False):
        # For playlist test cases judge by the first entry's metadata.
        if tc.get('playlist', []):
            tc = tc['playlist'][0]
        is_restricted = age_restricted(
            tc.get('info_dict', {}).get('age_limit'), age_limit)
        any_restricted = any_restricted or is_restricted
    return not any_restricted
def extract_subtitles(self, *args, **kwargs):
    """Return the subtitles dict from _get_subtitles() when the user asked
    for subtitles (or for a subtitle listing); otherwise an empty dict.

    Fix: previously fell through and returned None when subtitles were not
    requested, which breaks callers that merge or iterate the result.
    """
    if (self._downloader.params.get('writesubtitles', False) or
            self._downloader.params.get('listsubtitles')):
        return self._get_subtitles(*args, **kwargs)
    return {}
2305 def _get_subtitles(self, *args, **kwargs):
2306 raise NotImplementedError('This method must be implemented by subclasses')
2309 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2310 """ Merge subtitle items for one language. Items with duplicated URLs
2311 will be dropped. """
2312 list1_urls = set([item['url'] for item in subtitle_list1])
2313 ret = list(subtitle_list1)
2314 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2318 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2319 """ Merge two subtitle dictionaries, language by language. """
2320 ret = dict(subtitle_dict1)
2321 for lang in subtitle_dict2:
2322 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
def extract_automatic_captions(self, *args, **kwargs):
    """Return automatic captions from _get_automatic_captions() when the
    user asked for them; otherwise an empty dict.

    Fix: previously fell through and returned None when captions were not
    requested, which breaks callers that merge or iterate the result.
    """
    if (self._downloader.params.get('writeautomaticsub', False) or
            self._downloader.params.get('listsubtitles')):
        return self._get_automatic_captions(*args, **kwargs)
    return {}
2331 def _get_automatic_captions(self, *args, **kwargs):
2332 raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
    # Only mark videos watched when the user opted in, and only when we
    # are logged in or have a cookie file (otherwise the site call is moot).
    params = self._downloader.params
    if not params.get('mark_watched', False):
        return
    logged_in = self._get_login_info()[0] is not None
    if not logged_in and params.get('cookiefile') is None:
        return
    self._mark_watched(*args, **kwargs)
2340 def _mark_watched(self, *args, **kwargs):
2341 raise NotImplementedError('This method must be implemented by subclasses')
def geo_verification_headers(self):
    """Return HTTP headers for geo-verification requests: routes them
    through the user-configured geo_verification_proxy when set.

    Fixes: initialises the previously-undefined ``headers`` dict and
    returns it (both lines were missing, causing a NameError).
    """
    headers = {}
    geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
    if geo_verification_proxy:
        headers['Ytdl-request-proxy'] = geo_verification_proxy
    return headers
def _generic_id(self, url):
    # Fallback video id: the URL's last path segment, sans extension,
    # percent-decoded.
    last_segment = url.rstrip('/').split('/')[-1]
    stem = os.path.splitext(last_segment)[0]
    return compat_urllib_parse_unquote(stem)
def _generic_title(self, url):
    # Fallback title: the URL basename without extension, percent-decoded.
    stem = os.path.splitext(url_basename(url))[0]
    return compat_urllib_parse_unquote(stem)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.

    Fixes: restores the docstring delimiters, the @classmethod/@property
    decorators and the prefix-dispatch branch structure (including
    ``n = int(prefix)``) that were lost in this copy.
    """

    @classmethod
    def _make_valid_url(cls):
        # '' -> first result, 'all' -> _MAX_RESULTS, digits -> that many.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp over-large requests to the extractor's maximum.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY