1 from __future__ import unicode_literals
16 from ..compat import (
19 compat_etree_fromstring,
25 compat_urllib_parse_unquote,
26 compat_urllib_parse_urlencode,
27 compat_urllib_request,
30 from ..downloader.f4m import remove_encrypted_media
63 parse_m3u8_attributes,
70 class InfoExtractor(object):
71 """Information Extractor class.
73 Information extractors are the classes that, given a URL, extract
74 information about the video (or videos) the URL refers to. This
75 information includes the real video URL, the video title, author and
76 others. The information is stored in a dictionary which is then
77 passed to the YoutubeDL. The YoutubeDL processes this
78 information possibly downloading the video to the file system, among
79 other possible outcomes.
81 The type field determines the type of the result.
82 By far the most common value (and the default if _type is missing) is
83 "video", which indicates a single video.
85 For a video, the dictionaries must include the following fields:
88 title: Video title, unescaped.
90 Additionally, it must contain either a formats entry or a url one:
92 formats: A list of dictionaries for each format available, ordered
93 from worst to best quality.
96 * url Mandatory. The URL of the video file
98 The URL of the manifest file in case of
99 fragmented media (DASH, hls, hds)
100 * ext Will be calculated from URL if missing
101 * format A human-readable description of the format
102 ("mp4 container with h264/opus").
103 Calculated from the format_id, width, height.
104 and format_note fields if missing.
105 * format_id A short description of the format
106 ("mp4_h264_opus" or "19").
107 Technically optional, but strongly recommended.
108 * format_note Additional info about the format
109 ("3D" or "DASH video")
110 * width Width of the video, if known
111 * height Height of the video, if known
112 * resolution Textual description of width and height
113 * tbr Average bitrate of audio and video in KBit/s
114 * abr Average audio bitrate in KBit/s
115 * acodec Name of the audio codec in use
116 * asr Audio sampling rate in Hertz
117 * vbr Average video bitrate in KBit/s
119 * vcodec Name of the video codec in use
120 * container Name of the container format
121 * filesize The number of bytes, if known in advance
122 * filesize_approx An estimate for the number of bytes
123 * player_url SWF Player URL (used for rtmpdump).
124 * protocol The protocol that will be used for the actual
125 download, lower-case.
126 "http", "https", "rtsp", "rtmp", "rtmpe",
127 "m3u8", "m3u8_native" or "http_dash_segments".
129 Base URL for fragments. Each fragment's path
130 value (if present) will be relative to
132 * fragments A list of fragments of a fragmented media.
133 Each fragment entry must contain either an url
134 or a path. If an url is present it should be
135 considered by a client. Otherwise both path and
136 fragment_base_url must be present. Here is
137 the list of all potential fields:
138 * "url" - fragment's URL
139 * "path" - fragment's path relative to
141 * "duration" (optional, int or float)
142 * "filesize" (optional, int)
143 * preference Order number of this format. If this field is
144 present and not None, the formats get sorted
145 by this field, regardless of all other values.
146 -1 for default (order by other properties),
147 -2 or smaller for less than default.
148 < -1000 to hide the format (if there is
149 another one which is strictly better)
150 * language Language code, e.g. "de" or "en-US".
151 * language_preference Is this in the language mentioned in
153 10 if it's what the URL is about,
154 -1 for default (don't know),
155 -10 otherwise, other values reserved for now.
156 * quality Order number of the video quality of this
157 format, irrespective of the file format.
158 -1 for default (order by other properties),
159 -2 or smaller for less than default.
160 * source_preference Order number for this video source
161 (quality takes higher priority)
162 -1 for default (order by other properties),
163 -2 or smaller for less than default.
164 * http_headers A dictionary of additional HTTP headers
165 to add to the request.
166 * stretched_ratio If given and not 1, indicates that the
167 video's pixels are not square.
168 width : height ratio as float.
169 * no_resume The server does not support resuming the
170 (HTTP or RTMP) download. Boolean.
172 url: Final video URL.
173 ext: Video filename extension.
174 format: The video format, defaults to ext (used for --get-format)
175 player_url: SWF Player URL (used for rtmpdump).
177 The following fields are optional:
179 alt_title: A secondary title of the video.
180 display_id An alternative identifier for the video, not necessarily
181 unique, but available before title. Typically, id is
182 something like "4234987", title "Dancing naked mole rats",
183 and display_id "dancing-naked-mole-rats"
184 thumbnails: A list of dictionaries, with the following entries:
185 * "id" (optional, string) - Thumbnail format ID
187 * "preference" (optional, int) - quality of the image
188 * "width" (optional, int)
189 * "height" (optional, int)
190                        * "resolution" (optional, string "{width}x{height}",
192 * "filesize" (optional, int)
193 thumbnail: Full URL to a video thumbnail image.
194 description: Full video description.
195 uploader: Full name of the video uploader.
196 license: License name the video is licensed under.
197 creator: The creator of the video.
198 release_date: The date (YYYYMMDD) when the video was released.
199 timestamp: UNIX timestamp of the moment the video became available.
200 upload_date: Video upload date (YYYYMMDD).
201 If not explicitly set, calculated from timestamp.
202 uploader_id: Nickname or id of the video uploader.
203 uploader_url: Full URL to a personal webpage of the video uploader.
204 location: Physical location where the video was filmed.
205 subtitles: The available subtitles as a dictionary in the format
206 {tag: subformats}. "tag" is usually a language code, and
207 "subformats" is a list sorted from lower to higher
208 preference, each element is a dictionary with the "ext"
210 * "data": The subtitles file contents
211 * "url": A URL pointing to the subtitles file
212 "ext" will be calculated from URL if missing
213 automatic_captions: Like 'subtitles', used by the YoutubeIE for
214 automatically generated captions
215 duration: Length of the video in seconds, as an integer or float.
216 view_count: How many users have watched the video on the platform.
217 like_count: Number of positive ratings of the video
218 dislike_count: Number of negative ratings of the video
219 repost_count: Number of reposts of the video
220     average_rating: Average rating given by users, the scale used depends on the webpage
221 comment_count: Number of comments on the video
222 comments: A list of comments, each with one or more of the following
223 properties (all but one of text or html optional):
224 * "author" - human-readable name of the comment author
225 * "author_id" - user ID of the comment author
227 * "html" - Comment as HTML
228 * "text" - Plain text of the comment
229 * "timestamp" - UNIX timestamp of comment
230 * "parent" - ID of the comment this one is replying to.
231 Set to "root" to indicate that this is a
232 comment to the original video.
233 age_limit: Age restriction for the video, as an integer (years)
234 webpage_url: The URL to the video webpage, if given to youtube-dl it
235 should allow to get the same result again. (It will be set
236 by YoutubeDL if it's missing)
237 categories: A list of categories that the video falls in, for example
239 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
240 is_live: True, False, or None (=unknown). Whether this video is a
241 live stream that goes on instead of a fixed-length video.
242 start_time: Time in seconds where the reproduction should start, as
243 specified in the URL.
244 end_time: Time in seconds where the reproduction should end, as
245 specified in the URL.
247 The following fields should only be used when the video belongs to some logical
250 chapter: Name or title of the chapter the video belongs to.
251 chapter_number: Number of the chapter the video belongs to, as an integer.
252 chapter_id: Id of the chapter the video belongs to, as a unicode string.
254 The following fields should only be used when the video is an episode of some
255 series, programme or podcast:
257 series: Title of the series or programme the video episode belongs to.
258 season: Title of the season the video episode belongs to.
259 season_number: Number of the season the video episode belongs to, as an integer.
260 season_id: Id of the season the video episode belongs to, as a unicode string.
261 episode: Title of the video episode. Unlike mandatory video title field,
262 this field should denote the exact title of the video episode
263 without any kind of decoration.
264 episode_number: Number of the video episode within a season, as an integer.
265 episode_id: Id of the video episode, as a unicode string.
267 The following fields should only be used when the media is a track or a part of
270 track: Title of the track.
271 track_number: Number of the track within an album or a disc, as an integer.
272 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
274 artist: Artist(s) of the track.
275 genre: Genre(s) of the track.
276 album: Title of the album the track belongs to.
277 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
278 album_artist: List of all artists appeared on the album (e.g.
279 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281 disc_number: Number of the disc or other physical medium the track belongs to,
283 release_year: Year (YYYY) when the album was released.
285 Unless mentioned otherwise, the fields should be Unicode strings.
287 Unless mentioned otherwise, None is equivalent to absence of information.
290 _type "playlist" indicates multiple videos.
291 There must be a key "entries", which is a list, an iterable, or a PagedList
292 object, each element of which is a valid dictionary by this specification.
294 Additionally, playlists can have "title", "description" and "id" attributes
295 with the same semantics as videos (see above).
298 _type "multi_video" indicates that there are multiple videos that
299     form a single show, for example multiple acts of an opera or TV episode.
300 It must have an entries key like a playlist and contain all the keys
301 required for a video at the same time.
304 _type "url" indicates that the video must be extracted from another
305 location, possibly by a different extractor. Its only required key is:
306 "url" - the next URL to extract.
307 The key "ie_key" can be set to the class name (minus the trailing "IE",
308 e.g. "Youtube") if the extractor class is known in advance.
309 Additionally, the dictionary may have any properties of the resolved entity
310 known in advance, for example "title" if the title of the referred video is
314 _type "url_transparent" entities have the same specification as "url", but
315 indicate that the given additional information is more precise than the one
316 associated with the resolved URL.
317 This is useful when a site employs a video service that hosts the video and
318 its technical metadata, but that video service does not embed a useful
319 title, description etc.
322 Subclasses of this one should re-define the _real_initialize() and
323 _real_extract() methods and define a _VALID_URL regexp.
324 Probably, they should also be added to the list of extractors.
326 _BYPASS_GEO attribute may be set to False in order to disable
327 geo restriction bypass mechanisms for a particular extractor.
328 Though it won't disable explicit geo restriction bypass based on
329 country code provided with geo_bypass_country.
331 Finally, the _WORKING attribute should be set to False for broken IEs
332 in order to warn the users and skip the tests.
337 _x_forwarded_for_ip = None
341 def __init__(self, downloader=None):
342 """Constructor. Receives an optional downloader."""
344 self._x_forwarded_for_ip = None
345 self.set_downloader(downloader)
348 def suitable(cls, url):
349 """Receives a URL and returns True if suitable for this IE."""
351 # This does not use has/getattr intentionally - we want to know whether
352 # we have cached the regexp for *this* class, whereas getattr would also
353 # match the superclass
354 if '_VALID_URL_RE' not in cls.__dict__:
355 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
356 return cls._VALID_URL_RE.match(url) is not None
359 def _match_id(cls, url):
360 if '_VALID_URL_RE' not in cls.__dict__:
361 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
362 m = cls._VALID_URL_RE.match(url)
368 """Getter method for _WORKING."""
371 def initialize(self):
372 """Initializes an instance (authentication, etc)."""
373 if not self._x_forwarded_for_ip:
374 country_code = self._downloader.params.get('geo_bypass_country', None)
376 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
378 self._real_initialize()
381 def extract(self, url):
382 """Extracts URL information and returns it in list of dicts."""
387 ie_result = self._real_extract(url)
388 if self._x_forwarded_for_ip:
389 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
391 except GeoRestrictedError as e:
392 if (not self._downloader.params.get('geo_bypass_country', None) and
394 self._downloader.params.get('geo_bypass', True) and
395 not self._x_forwarded_for_ip and
397 self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries))
398 if self._x_forwarded_for_ip:
400 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
403 except ExtractorError:
405 except compat_http_client.IncompleteRead as e:
406 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
407 except (KeyError, StopIteration) as e:
408 raise ExtractorError('An extractor error has occurred.', cause=e)
410 def set_downloader(self, downloader):
411 """Sets the downloader for this IE."""
412 self._downloader = downloader
414 def _real_initialize(self):
415 """Real initialization process. Redefine in subclasses."""
418 def _real_extract(self, url):
419 """Real extraction process. Redefine in subclasses."""
424 """A string for getting the InfoExtractor with get_info_extractor"""
425 return compat_str(cls.__name__[:-2])
429 return compat_str(type(self).__name__[:-2])
431 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
432 """ Returns the response handle """
434 self.report_download_webpage(video_id)
435 elif note is not False:
437 self.to_screen('%s' % (note,))
439 self.to_screen('%s: %s' % (video_id, note))
440 if isinstance(url_or_request, compat_urllib_request.Request):
441 url_or_request = update_Request(
442 url_or_request, data=data, headers=headers, query=query)
445 url_or_request = update_url_query(url_or_request, query)
446 if data is not None or headers:
447 url_or_request = sanitized_Request(url_or_request, data, headers)
449 return self._downloader.urlopen(url_or_request)
450 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
454 errnote = 'Unable to download webpage'
456 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
458 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
460 self._downloader.report_warning(errmsg)
463 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
464 """ Returns a tuple (page content as string, URL handle) """
465 # Strip hashes from the URL (#1038)
466 if isinstance(url_or_request, (compat_str, str)):
467 url_or_request = url_or_request.partition('#')[0]
469 # Some sites check X-Forwarded-For HTTP header in order to figure out
470 # the origin of the client behind proxy. This allows bypassing geo
471 # restriction by faking this header's value to IP that belongs to some
472 # geo unrestricted country. We will do so once we encounter any
473 # geo restriction error.
474 if self._x_forwarded_for_ip:
475 if 'X-Forwarded-For' not in headers:
476 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
478 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
482 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
483 return (content, urlh)
486 def _guess_encoding_from_content(content_type, webpage_bytes):
487 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
489 encoding = m.group(1)
491 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
492 webpage_bytes[:1024])
494 encoding = m.group(1).decode('ascii')
495 elif webpage_bytes.startswith(b'\xff\xfe'):
502 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
503 content_type = urlh.headers.get('Content-Type', '')
504 webpage_bytes = urlh.read()
505 if prefix is not None:
506 webpage_bytes = prefix + webpage_bytes
508 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
509 if self._downloader.params.get('dump_intermediate_pages', False):
511 url = url_or_request.get_full_url()
512 except AttributeError:
514 self.to_screen('Dumping request to ' + url)
515 dump = base64.b64encode(webpage_bytes).decode('ascii')
516 self._downloader.to_screen(dump)
517 if self._downloader.params.get('write_pages', False):
519 url = url_or_request.get_full_url()
520 except AttributeError:
522 basen = '%s_%s' % (video_id, url)
524 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
525 basen = basen[:240 - len(h)] + h
526 raw_filename = basen + '.dump'
527 filename = sanitize_filename(raw_filename, restricted=True)
528 self.to_screen('Saving request to ' + filename)
529 # Working around MAX_PATH limitation on Windows (see
530 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
531 if compat_os_name == 'nt':
532 absfilepath = os.path.abspath(filename)
533 if len(absfilepath) > 259:
534 filename = '\\\\?\\' + absfilepath
535 with open(filename, 'wb') as outf:
536 outf.write(webpage_bytes)
539 content = webpage_bytes.decode(encoding, 'replace')
541 content = webpage_bytes.decode('utf-8', 'replace')
543 if ('<title>Access to this site is blocked</title>' in content and
544 'Websense' in content[:512]):
545 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
546 blocked_iframe = self._html_search_regex(
547 r'<iframe src="([^"]+)"', content,
548 'Websense information URL', default=None)
550 msg += ' Visit %s for more details' % blocked_iframe
551 raise ExtractorError(msg, expected=True)
552 if '<title>The URL you requested has been blocked</title>' in content[:512]:
554 'Access to this webpage has been blocked by Indian censorship. '
555 'Use a VPN or proxy server (with --proxy) to route around it.')
556 block_msg = self._html_search_regex(
557 r'</h1><p>(.*?)</p>',
558 content, 'block message', default=None)
560 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
561 raise ExtractorError(msg, expected=True)
565 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
566 """ Returns the data of the page as a string """
569 while success is False:
571 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
573 except compat_http_client.IncompleteRead as e:
575 if try_count >= tries:
577 self._sleep(timeout, video_id)
584 def _download_xml(self, url_or_request, video_id,
585 note='Downloading XML', errnote='Unable to download XML',
586 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
587 """Return the xml as an xml.etree.ElementTree.Element"""
588 xml_string = self._download_webpage(
589 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
590 if xml_string is False:
593 xml_string = transform_source(xml_string)
594 return compat_etree_fromstring(xml_string.encode('utf-8'))
596 def _download_json(self, url_or_request, video_id,
597 note='Downloading JSON metadata',
598 errnote='Unable to download JSON metadata',
599 transform_source=None,
600 fatal=True, encoding=None, data=None, headers={}, query={}):
601 json_string = self._download_webpage(
602 url_or_request, video_id, note, errnote, fatal=fatal,
603 encoding=encoding, data=data, headers=headers, query=query)
604 if (not fatal) and json_string is False:
606 return self._parse_json(
607 json_string, video_id, transform_source=transform_source, fatal=fatal)
609 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
611 json_string = transform_source(json_string)
613 return json.loads(json_string)
614 except ValueError as ve:
615 errmsg = '%s: Failed to parse JSON ' % video_id
617 raise ExtractorError(errmsg, cause=ve)
619 self.report_warning(errmsg + str(ve))
621 def report_warning(self, msg, video_id=None):
622 idstr = '' if video_id is None else '%s: ' % video_id
623 self._downloader.report_warning(
624 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
626 def to_screen(self, msg):
627 """Print msg to screen, prefixing it with '[ie_name]'"""
628 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
630 def report_extraction(self, id_or_name):
631 """Report information extraction."""
632 self.to_screen('%s: Extracting information' % id_or_name)
634 def report_download_webpage(self, video_id):
635 """Report webpage download."""
636 self.to_screen('%s: Downloading webpage' % video_id)
638 def report_age_confirmation(self):
639 """Report attempt to confirm age."""
640 self.to_screen('Confirming age')
642 def report_login(self):
643 """Report attempt to log in."""
644 self.to_screen('Logging in')
647 def raise_login_required(msg='This video is only available for registered users'):
648 raise ExtractorError(
649 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
653 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
654 raise GeoRestrictedError(msg, countries=countries)
656 # Methods for following #608
658 def url_result(url, ie=None, video_id=None, video_title=None):
659 """Returns a URL that points to a page that should be processed"""
660 # TODO: ie should be the class used for getting the info
661 video_info = {'_type': 'url',
664 if video_id is not None:
665 video_info['id'] = video_id
666 if video_title is not None:
667 video_info['title'] = video_title
671 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
672 """Returns a playlist"""
673 video_info = {'_type': 'playlist',
676 video_info['id'] = playlist_id
678 video_info['title'] = playlist_title
679 if playlist_description:
680 video_info['description'] = playlist_description
683 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
685 Perform a regex search on the given string, using a single or a list of
686 patterns returning the first matching group.
687 In case of failure return a default value or raise a WARNING or a
688 RegexNotFoundError, depending on fatal, specifying the field name.
690 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
691 mobj = re.search(pattern, string, flags)
694 mobj = re.search(p, string, flags)
698 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
699 _name = '\033[0;34m%s\033[0m' % name
705 # return the first matching group
706 return next(g for g in mobj.groups() if g is not None)
708 return mobj.group(group)
709 elif default is not NO_DEFAULT:
712 raise RegexNotFoundError('Unable to extract %s' % _name)
714 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
717 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
719 Like _search_regex, but strips HTML tags and unescapes entities.
721 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
723 return clean_html(res).strip()
727 def _get_netrc_login_info(self, netrc_machine=None):
730 netrc_machine = netrc_machine or self._NETRC_MACHINE
732 if self._downloader.params.get('usenetrc', False):
734 info = netrc.netrc().authenticators(netrc_machine)
739 raise netrc.NetrcParseError(
740 'No authenticators for %s' % netrc_machine)
741 except (IOError, netrc.NetrcParseError) as err:
742 self._downloader.report_warning(
743 'parsing .netrc: %s' % error_to_compat_str(err))
745 return username, password
747 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
749 Get the login info as (username, password)
750 First look for the manually specified credentials using username_option
751 and password_option as keys in params dictionary. If no such credentials
752 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
754 If there's no info available, return (None, None)
756 if self._downloader is None:
759 downloader_params = self._downloader.params
761 # Attempt to use provided username and password or .netrc data
762 if downloader_params.get(username_option) is not None:
763 username = downloader_params[username_option]
764 password = downloader_params[password_option]
766 username, password = self._get_netrc_login_info(netrc_machine)
768 return username, password
770 def _get_tfa_info(self, note='two-factor verification code'):
772 Get the two-factor authentication info
773 TODO - asking the user will be required for sms/phone verify
774 currently just uses the command line option
775 If there's no info available, return None
777 if self._downloader is None:
779 downloader_params = self._downloader.params
781 if downloader_params.get('twofactor') is not None:
782 return downloader_params['twofactor']
784 return compat_getpass('Type %s and press [Return]: ' % note)
786 # Helper functions for extracting OpenGraph info
788 def _og_regexes(prop):
789 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
790 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
791 % {'prop': re.escape(prop)})
792 template = r'<meta[^>]+?%s[^>]+?%s'
794 template % (property_re, content_re),
795 template % (content_re, property_re),
799 def _meta_regex(prop):
800 return r'''(?isx)<meta
801 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
802 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
804 def _og_search_property(self, prop, html, name=None, **kargs):
805 if not isinstance(prop, (list, tuple)):
808 name = 'OpenGraph %s' % prop[0]
811 og_regexes.extend(self._og_regexes(p))
812 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
815 return unescapeHTML(escaped)
817 def _og_search_thumbnail(self, html, **kargs):
818 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
820 def _og_search_description(self, html, **kargs):
821 return self._og_search_property('description', html, fatal=False, **kargs)
823 def _og_search_title(self, html, **kargs):
824 return self._og_search_property('title', html, **kargs)
826 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
827 regexes = self._og_regexes('video') + self._og_regexes('video:url')
829 regexes = self._og_regexes('video:secure_url') + regexes
830 return self._html_search_regex(regexes, html, name, **kargs)
832 def _og_search_url(self, html, **kargs):
833 return self._og_search_property('url', html, **kargs)
835 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
836 if not isinstance(name, (list, tuple)):
838 if display_name is None:
839 display_name = name[0]
840 return self._html_search_regex(
841 [self._meta_regex(n) for n in name],
842 html, display_name, fatal=fatal, group='content', **kwargs)
844 def _dc_search_uploader(self, html):
845 return self._html_search_meta('dc.creator', html, 'uploader')
847 def _rta_search(self, html):
848 # See http://www.rtalabel.org/index.php?content=howtofaq#single
849 if re.search(r'(?ix)<meta\s+name="rating"\s+'
850 r' content="RTA-5042-1996-1400-1577-RTA"',
855 def _media_rating_search(self, html):
856 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
857 rating = self._html_search_meta('rating', html)
869 return RATING_TABLE.get(rating.lower())
871 def _family_friendly_search(self, html):
872 # See http://schema.org/VideoObject
873 family_friendly = self._html_search_meta('isFamilyFriendly', html)
875 if not family_friendly:
884 return RATING_TABLE.get(family_friendly.lower())
886 def _twitter_search_player(self, html):
887 return self._html_search_meta('twitter:player', html,
888 'twitter card player')
890 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
891 json_ld = self._search_regex(
892 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
893 html, 'JSON-LD', group='json_ld', **kwargs)
894 default = kwargs.get('default', NO_DEFAULT)
896 return default if default is not NO_DEFAULT else {}
897 # JSON-LD may be malformed and thus `fatal` should be respected.
898 # At the same time `default` may be passed that assumes `fatal=False`
899 # for _search_regex. Let's simulate the same behavior here as well.
900 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
901 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
903 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
904 if isinstance(json_ld, compat_str):
905 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
909 if not isinstance(json_ld, (list, tuple, dict)):
911 if isinstance(json_ld, dict):
914 if e.get('@context') == 'http://schema.org':
915 item_type = e.get('@type')
916 if expected_type is not None and expected_type != item_type:
918 if item_type == 'TVEpisode':
920 'episode': unescapeHTML(e.get('name')),
921 'episode_number': int_or_none(e.get('episodeNumber')),
922 'description': unescapeHTML(e.get('description')),
924 part_of_season = e.get('partOfSeason')
925 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
926 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
927 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
928 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
929 info['series'] = unescapeHTML(part_of_series.get('name'))
930 elif item_type == 'Article':
932 'timestamp': parse_iso8601(e.get('datePublished')),
933 'title': unescapeHTML(e.get('headline')),
934 'description': unescapeHTML(e.get('articleBody')),
936 elif item_type == 'VideoObject':
938 'url': e.get('contentUrl'),
939 'title': unescapeHTML(e.get('name')),
940 'description': unescapeHTML(e.get('description')),
941 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
942 'duration': parse_duration(e.get('duration')),
943 'timestamp': unified_timestamp(e.get('uploadDate')),
944 'filesize': float_or_none(e.get('contentSize')),
945 'tbr': int_or_none(e.get('bitrate')),
946 'width': int_or_none(e.get('width')),
947 'height': int_or_none(e.get('height')),
950 return dict((k, v) for k, v in info.items() if v is not None)
def _hidden_inputs(html):
    """Return a dict of hidden/submit <input> name -> value pairs found in html.

    HTML comments are stripped first so commented-out inputs are ignored.
    Inputs whose type is not 'hidden' or 'submit' are skipped; the input's
    'id' attribute is used as a fallback key when 'name' is absent.
    """
    # Strip HTML comments (non-greedy, does not cross nested comment openers)
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_el)
        if not input_el:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        # value may legitimately be an empty string, so only reject None
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Extract the hidden inputs of the <form> with the given id attribute."""
    form_pattern = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(
        form_pattern, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
974 def _sort_formats(self, formats, field_preference=None):
976 raise ExtractorError('No video formats found')
979 # Automatically determine tbr when missing based on abr and vbr (improves
980 # formats sorting in some cases)
981 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
982 f['tbr'] = f['abr'] + f['vbr']
985 # TODO remove the following workaround
986 from ..utils import determine_ext
987 if not f.get('ext') and 'url' in f:
988 f['ext'] = determine_ext(f['url'])
990 if isinstance(field_preference, (list, tuple)):
993 if f.get(field) is not None
994 else ('' if field == 'format_id' else -1)
995 for field in field_preference)
997 preference = f.get('preference')
998 if preference is None:
1000 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1003 protocol = f.get('protocol') or determine_protocol(f)
1004 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1006 if f.get('vcodec') == 'none': # audio only
1008 if self._downloader.params.get('prefer_free_formats'):
1009 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1011 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1014 audio_ext_preference = ORDER.index(f['ext'])
1016 audio_ext_preference = -1
1018 if f.get('acodec') == 'none': # video only
1020 if self._downloader.params.get('prefer_free_formats'):
1021 ORDER = ['flv', 'mp4', 'webm']
1023 ORDER = ['webm', 'flv', 'mp4']
1025 ext_preference = ORDER.index(f['ext'])
1028 audio_ext_preference = 0
1032 f.get('language_preference') if f.get('language_preference') is not None else -1,
1033 f.get('quality') if f.get('quality') is not None else -1,
1034 f.get('tbr') if f.get('tbr') is not None else -1,
1035 f.get('filesize') if f.get('filesize') is not None else -1,
1036 f.get('vbr') if f.get('vbr') is not None else -1,
1037 f.get('height') if f.get('height') is not None else -1,
1038 f.get('width') if f.get('width') is not None else -1,
1041 f.get('abr') if f.get('abr') is not None else -1,
1042 audio_ext_preference,
1043 f.get('fps') if f.get('fps') is not None else -1,
1044 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1045 f.get('source_preference') if f.get('source_preference') is not None else -1,
1046 f.get('format_id') if f.get('format_id') is not None else '',
1048 formats.sort(key=_formats_key)
1050 def _check_formats(self, formats, video_id):
1052 formats[:] = filter(
1053 lambda f: self._is_valid_url(
1055 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1059 def _remove_duplicate_formats(formats):
1063 if f['url'] not in format_urls:
1064 format_urls.add(f['url'])
1065 unique_formats.append(f)
1066 formats[:] = unique_formats
def _is_valid_url(self, url, video_id, item='video', headers={}):
    """Return True if `url` responds to a probe request, False if it is
    clearly invalid (URL-level error), re-raising other extraction errors.

    `headers` is only read and forwarded, never mutated, so the mutable
    default is harmless here.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not (url.startswith('http://') or url.startswith('https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
        return True
    except ExtractorError as e:
        # Only URL-level failures (DNS, connection, HTTP error wrapped in
        # URLError) mean "invalid URL"; anything else is a real error.
        if isinstance(e.cause, compat_urllib_error.URLError):
            self.to_screen(
                '%s: %s URL is invalid, skipping' % (video_id, item))
            return False
        raise
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    return (
        'http:'
        if self._downloader.params.get('prefer_insecure', False)
        else 'https:')
1090 def _proto_relative_url(self, url, scheme=None):
1093 if url.startswith('//'):
1095 scheme = self.http_scheme()
def _sleep(self, timeout, video_id, msg_template=None):
    """Sleep for `timeout` seconds, printing a templated message first.

    `msg_template` may use %(video_id)s and %(timeout)s placeholders.
    """
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    self.to_screen(msg)
    time.sleep(timeout)
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True, m3u8_id=None):
    """Download an f4m (HDS) manifest and return the formats parsed from it.

    Returns an empty list when the download fails and fatal is False.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,
        fatal=fatal)
    if manifest is False:
        return []
    return self._parse_f4m_formats(
        manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
        transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1125 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1126 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1127 fatal=True, m3u8_id=None):
1128 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1129 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1130 if akamai_pv is not None and ';' in akamai_pv.text:
1131 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1132 if playerVerificationChallenge.strip() != '':
1136 manifest_version = '1.0'
1137 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1139 manifest_version = '2.0'
1140 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1141 # Remove unsupported DRM protected media from final formats
1142 # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1143 media_nodes = remove_encrypted_media(media_nodes)
1146 base_url = xpath_text(
1147 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1148 'base URL', default=None)
1150 base_url = base_url.strip()
1152 bootstrap_info = xpath_element(
1153 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1154 'bootstrap info', default=None)
1157 mime_type = xpath_text(
1158 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1159 'base URL', default=None)
1160 if mime_type and mime_type.startswith('audio/'):
1163 for i, media_el in enumerate(media_nodes):
1164 tbr = int_or_none(media_el.attrib.get('bitrate'))
1165 width = int_or_none(media_el.attrib.get('width'))
1166 height = int_or_none(media_el.attrib.get('height'))
1167 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1168 # If <bootstrapInfo> is present, the specified f4m is a
1169 # stream-level manifest, and only set-level manifests may refer to
1170 # external resources. See section 11.4 and section 4 of F4M spec
1171 if bootstrap_info is None:
1173 # @href is introduced in 2.0, see section 11.6 of F4M spec
1174 if manifest_version == '2.0':
1175 media_url = media_el.attrib.get('href')
1176 if media_url is None:
1177 media_url = media_el.attrib.get('url')
1181 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1182 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1183 # If media_url is itself a f4m manifest do the recursive extraction
1184 # since bitrates in parent manifest (this one) and media_url manifest
1185 # may differ leading to inability to resolve the format by requested
1186 # bitrate in f4m downloader
1187 ext = determine_ext(manifest_url)
1189 f4m_formats = self._extract_f4m_formats(
1190 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1191 transform_source=transform_source, fatal=fatal)
1192 # Sometimes stream-level manifest contains single media entry that
1193 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1194 # At the same time parent's media entry in set-level manifest may
1195 # contain it. We will copy it from parent in such cases.
1196 if len(f4m_formats) == 1:
1199 'tbr': f.get('tbr') or tbr,
1200 'width': f.get('width') or width,
1201 'height': f.get('height') or height,
1202 'format_id': f.get('format_id') if not tbr else format_id,
1205 formats.extend(f4m_formats)
1208 formats.extend(self._extract_m3u8_formats(
1209 manifest_url, video_id, 'mp4', preference=preference,
1210 m3u8_id=m3u8_id, fatal=fatal))
1213 'format_id': format_id,
1214 'url': manifest_url,
1215 'manifest_url': manifest_url,
1216 'ext': 'flv' if bootstrap_info is not None else None,
1221 'preference': preference,
1225 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1227 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1231 'preference': preference - 100 if preference else -100,
1232 'resolution': 'multiple',
1233 'format_note': 'Quality selection URL',
1236 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1237 entry_protocol='m3u8', preference=None,
1238 m3u8_id=None, note=None, errnote=None,
1239 fatal=True, live=False):
1241 res = self._download_webpage_handle(
1243 note=note or 'Downloading m3u8 information',
1244 errnote=errnote or 'Failed to download m3u8 information',
1248 m3u8_doc, urlh = res
1249 m3u8_url = urlh.geturl()
1251 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1254 formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1256 format_url = lambda u: (
1258 if re.match(r'^https?://', u)
1259 else compat_urlparse.urljoin(m3u8_url, u))
1261 # We should try extracting formats only from master playlists [1], i.e.
1262 # playlists that describe available qualities. On the other hand media
1263 # playlists [2] should be returned as is since they contain just the media
1264 # without qualities renditions.
1265 # Fortunately, master playlist can be easily distinguished from media
1266 # playlist based on particular tags availability. As of [1, 2] master
1267 # playlist tags MUST NOT appear in a media playist and vice versa.
1268 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1269 # and MUST NOT appear in master playlist thus we can clearly detect media
1270 # playlist with this criterion.
1271 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1272 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1273 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1274 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1277 'format_id': m3u8_id,
1279 'protocol': entry_protocol,
1280 'preference': preference,
1282 audio_in_video_stream = {}
1285 for line in m3u8_doc.splitlines():
1286 if line.startswith('#EXT-X-STREAM-INF:'):
1287 last_info = parse_m3u8_attributes(line)
1288 elif line.startswith('#EXT-X-MEDIA:'):
1289 media = parse_m3u8_attributes(line)
1290 media_type = media.get('TYPE')
1291 if media_type in ('VIDEO', 'AUDIO'):
1292 group_id = media.get('GROUP-ID')
1293 media_url = media.get('URI')
1296 for v in (group_id, media.get('NAME')):
1300 'format_id': '-'.join(format_id),
1301 'url': format_url(media_url),
1302 'language': media.get('LANGUAGE'),
1304 'protocol': entry_protocol,
1305 'preference': preference,
1307 if media_type == 'AUDIO':
1308 f['vcodec'] = 'none'
1309 if group_id and not audio_in_video_stream.get(group_id):
1310 audio_in_video_stream[group_id] = False
1313 # When there is no URI in EXT-X-MEDIA let this tag's
1314 # data be used by regular URI lines below
1316 if media_type == 'AUDIO' and group_id:
1317 audio_in_video_stream[group_id] = True
1318 elif line.startswith('#') or not line.strip():
1321 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1324 format_id.append(m3u8_id)
1325 # Despite specification does not mention NAME attribute for
1326 # EXT-X-STREAM-INF it still sometimes may be present
1327 stream_name = last_info.get('NAME') or last_media.get('NAME')
1328 # Bandwidth of live streams may differ over time thus making
1329 # format_id unpredictable. So it's better to keep provided
1332 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1333 manifest_url = format_url(line.strip())
1335 'format_id': '-'.join(format_id),
1336 'url': manifest_url,
1337 'manifest_url': manifest_url,
1340 'fps': float_or_none(last_info.get('FRAME-RATE')),
1341 'protocol': entry_protocol,
1342 'preference': preference,
1344 resolution = last_info.get('RESOLUTION')
1346 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1348 f['width'] = int(mobj.group('width'))
1349 f['height'] = int(mobj.group('height'))
1350 # Unified Streaming Platform
1352 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1354 abr, vbr = mobj.groups()
1355 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1360 f.update(parse_codecs(last_info.get('CODECS')))
1361 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1362 # TODO: update acodec for audio only formats with the same GROUP-ID
1363 f['acodec'] = 'none'
1370 def _xpath_ns(path, namespace=None):
1374 for c in path.split('/'):
1375 if not c or c == '.':
1378 out.append('{%s}%s' % (namespace, c))
1379 return '/'.join(out)
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """Download a SMIL document and return the formats described in it.

    Returns an empty list when the download fails non-fatally.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if smil is False:
        # _download_xml only returns False when fatal is disabled
        assert not fatal
        return []
    namespace = self._parse_smil_namespace(smil)
    return self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download a SMIL document and return the full info dict parsed from it.

    Returns an empty dict when the download fails non-fatally.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is False:
        return {}
    return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch a SMIL document and return it parsed as an XML element tree."""
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
1404 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1405 namespace = self._parse_smil_namespace(smil)
1407 formats = self._parse_smil_formats(
1408 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1409 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1411 video_id = os.path.splitext(url_basename(smil_url))[0]
1415 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1416 name = meta.attrib.get('name')
1417 content = meta.attrib.get('content')
1418 if not name or not content:
1420 if not title and name == 'title':
1422 elif not description and name in ('description', 'abstract'):
1423 description = content
1424 elif not upload_date and name == 'date':
1425 upload_date = unified_strdate(content)
1428 'id': image.get('type'),
1429 'url': image.get('src'),
1430 'width': int_or_none(image.get('width')),
1431 'height': int_or_none(image.get('height')),
1432 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1436 'title': title or video_id,
1437 'description': description,
1438 'upload_date': upload_date,
1439 'thumbnails': thumbnails,
1441 'subtitles': subtitles,
def _parse_smil_namespace(self, smil):
    # The root tag of a namespaced SMIL document looks like '{uri}smil';
    # pull out the namespace URI, or None if the document is unqualified.
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
1448 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1450 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1451 b = meta.get('base') or meta.get('httpBase')
1462 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1463 for medium in media:
1464 src = medium.get('src')
1465 if not src or src in srcs:
1469 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1470 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1471 width = int_or_none(medium.get('width'))
1472 height = int_or_none(medium.get('height'))
1473 proto = medium.get('proto')
1474 ext = medium.get('ext')
1475 src_ext = determine_ext(src)
1476 streamer = medium.get('streamer') or base
1478 if proto == 'rtmp' or streamer.startswith('rtmp'):
1484 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1486 'filesize': filesize,
1490 if transform_rtmp_url:
1491 streamer, src = transform_rtmp_url(streamer, src)
1492 formats[-1].update({
1498 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1499 src_url = src_url.strip()
1501 if proto == 'm3u8' or src_ext == 'm3u8':
1502 m3u8_formats = self._extract_m3u8_formats(
1503 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1504 if len(m3u8_formats) == 1:
1506 m3u8_formats[0].update({
1507 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1512 formats.extend(m3u8_formats)
1515 if src_ext == 'f4m':
1520 'plugin': 'flowplayer-3.2.0.1',
1522 f4m_url += '&' if '?' in f4m_url else '?'
1523 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1524 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1527 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1531 'ext': ext or src_ext or 'flv',
1532 'format_id': 'http-%d' % (bitrate or http_count),
1534 'filesize': filesize,
1542 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1545 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1546 src = textstream.get('src')
1547 if not src or src in urls:
1550 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1551 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1552 subtitles.setdefault(lang, []).append({
1558 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1559 xspf = self._download_xml(
1560 playlist_url, playlist_id, 'Downloading xpsf playlist',
1561 'Unable to download xspf manifest', fatal=fatal)
1564 return self._parse_xspf(xspf, playlist_id)
1566 def _parse_xspf(self, playlist, playlist_id):
1568 'xspf': 'http://xspf.org/ns/0/',
1569 's1': 'http://static.streamone.nl/player/ns/0',
1573 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1575 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1576 description = xpath_text(
1577 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1578 thumbnail = xpath_text(
1579 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1580 duration = float_or_none(
1581 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1584 'url': location.text,
1585 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1586 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1587 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1588 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1589 self._sort_formats(formats)
1594 'description': description,
1595 'thumbnail': thumbnail,
1596 'duration': duration,
1601 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1602 res = self._download_webpage_handle(
1604 note=note or 'Downloading MPD manifest',
1605 errnote=errnote or 'Failed to download MPD manifest',
1610 mpd_base_url = base_url(urlh.geturl())
1612 return self._parse_mpd_formats(
1613 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1614 formats_dict=formats_dict, mpd_url=mpd_url)
1616 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1618 Parse formats from MPD manifest.
1620 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1621 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1622 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1624 if mpd_doc.get('type') == 'dynamic':
1627 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1630 return self._xpath_ns(path, namespace)
def is_drm_protected(element):
    # An element is considered DRM-protected when it carries a
    # ContentProtection descriptor child.
    content_protection = element.find(_add_ns('ContentProtection'))
    return content_protection is not None
1635 def extract_multisegment_info(element, ms_parent_info):
1636 ms_info = ms_parent_info.copy()
1638 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1639 # common attributes and elements. We will only extract relevant
1641 def extract_common(source):
1642 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1643 if segment_timeline is not None:
1644 s_e = segment_timeline.findall(_add_ns('S'))
1646 ms_info['total_number'] = 0
1649 r = int(s.get('r', 0))
1650 ms_info['total_number'] += 1 + r
1651 ms_info['s'].append({
1652 't': int(s.get('t', 0)),
1653 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1654 'd': int(s.attrib['d']),
1657 start_number = source.get('startNumber')
1659 ms_info['start_number'] = int(start_number)
1660 timescale = source.get('timescale')
1662 ms_info['timescale'] = int(timescale)
1663 segment_duration = source.get('duration')
1664 if segment_duration:
1665 ms_info['segment_duration'] = int(segment_duration)
1667 def extract_Initialization(source):
1668 initialization = source.find(_add_ns('Initialization'))
1669 if initialization is not None:
1670 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1672 segment_list = element.find(_add_ns('SegmentList'))
1673 if segment_list is not None:
1674 extract_common(segment_list)
1675 extract_Initialization(segment_list)
1676 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1678 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1680 segment_template = element.find(_add_ns('SegmentTemplate'))
1681 if segment_template is not None:
1682 extract_common(segment_template)
1683 media = segment_template.get('media')
1685 ms_info['media'] = media
1686 initialization = segment_template.get('initialization')
1688 ms_info['initialization'] = initialization
1690 extract_Initialization(segment_template)
1693 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1695 for period in mpd_doc.findall(_add_ns('Period')):
1696 period_duration = parse_duration(period.get('duration')) or mpd_duration
1697 period_ms_info = extract_multisegment_info(period, {
1701 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1702 if is_drm_protected(adaptation_set):
1704 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1705 for representation in adaptation_set.findall(_add_ns('Representation')):
1706 if is_drm_protected(representation):
1708 representation_attrib = adaptation_set.attrib.copy()
1709 representation_attrib.update(representation.attrib)
1710 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1711 mime_type = representation_attrib['mimeType']
1712 content_type = mime_type.split('/')[0]
1713 if content_type == 'text':
1714 # TODO implement WebVTT downloading
1716 elif content_type == 'video' or content_type == 'audio':
1718 for element in (representation, adaptation_set, period, mpd_doc):
1719 base_url_e = element.find(_add_ns('BaseURL'))
1720 if base_url_e is not None:
1721 base_url = base_url_e.text + base_url
1722 if re.match(r'^https?://', base_url):
1724 if mpd_base_url and not re.match(r'^https?://', base_url):
1725 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1727 base_url = mpd_base_url + base_url
1728 representation_id = representation_attrib.get('id')
1729 lang = representation_attrib.get('lang')
1730 url_el = representation.find(_add_ns('BaseURL'))
1731 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1732 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1734 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1736 'manifest_url': mpd_url,
1737 'ext': mimetype2ext(mime_type),
1738 'width': int_or_none(representation_attrib.get('width')),
1739 'height': int_or_none(representation_attrib.get('height')),
1740 'tbr': int_or_none(bandwidth, 1000),
1741 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1742 'fps': int_or_none(representation_attrib.get('frameRate')),
1743 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1744 'format_note': 'DASH %s' % content_type,
1745 'filesize': filesize,
1747 f.update(parse_codecs(representation_attrib.get('codecs')))
1748 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
    """Convert a DASH SegmentTemplate string into a %-format template.

    $RepresentationID$ is substituted immediately; $Number$/$Time$/
    $Bandwidth$ (with optional printf-style width, e.g. $Number%05d$)
    become %(Number)d-style placeholders filled in per segment.
    """
    t = representation_ms_info[template_name]
    t = t.replace('$RepresentationID$', representation_id)
    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
    # BUGFIX: str.replace returns a new string; the original discarded the
    # result, leaving literal '$$' escapes (meaning '$') unexpanded.
    t = t.replace('$$', '$')
    return t
1758 # @initialization is a regular template like @media one
1759 # so it should be handled just the same way (see
1760 # https://github.com/rg3/youtube-dl/issues/11605)
1761 if 'initialization' in representation_ms_info:
1762 initialization_template = prepare_template(
1764 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1765 # $Time$ shall not be included for @initialization thus
1766 # only $Bandwidth$ remains
1768 representation_ms_info['initialization_url'] = initialization_template % {
1769 'Bandwidth': bandwidth,
1772 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1774 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1776 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1777 # can't be used at the same time
1778 if '%(Number' in media_template and 's' not in representation_ms_info:
1779 segment_duration = None
1780 if 'total_number' not in representation_ms_info and 'segment_duration':
1781 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1782 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1783 representation_ms_info['fragments'] = [{
1784 'url': media_template % {
1785 'Number': segment_number,
1786 'Bandwidth': bandwidth,
1788 'duration': segment_duration,
1789 } for segment_number in range(
1790 representation_ms_info['start_number'],
1791 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1793 # $Number*$ or $Time$ in media template with S list available
1794 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1795 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1796 representation_ms_info['fragments'] = []
1799 segment_number = representation_ms_info['start_number']
def add_segment_url():
    """Render the media template for the current segment and append the
    resulting fragment (url + duration in seconds) to the representation's
    fragment list. Reads segment_time/segment_number/segment_d/bandwidth
    from the enclosing scope.
    """
    segment_url = media_template % {
        'Time': segment_time,
        'Bandwidth': bandwidth,
        'Number': segment_number,
    }
    representation_ms_info['fragments'].append({
        'url': segment_url,
        # @d/@t are in timescale units; convert to seconds
        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
    })
1812 for num, s in enumerate(representation_ms_info['s']):
1813 segment_time = s.get('t') or segment_time
1817 for r in range(s.get('r', 0)):
1818 segment_time += segment_d
1821 segment_time += segment_d
1822 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1824 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1825 # or any YouTube dashsegments video
1828 timescale = representation_ms_info['timescale']
1829 for s in representation_ms_info['s']:
1830 duration = float_or_none(s['d'], timescale)
1831 for r in range(s.get('r', 0) + 1):
1833 'url': representation_ms_info['segment_urls'][segment_index],
1834 'duration': duration,
1837 representation_ms_info['fragments'] = fragments
1838 # NB: MPD manifest may contain direct URLs to unfragmented media.
1839 # No fragments key is present in this case.
1840 if 'fragments' in representation_ms_info:
1843 'protocol': 'http_dash_segments',
1845 if 'initialization_url' in representation_ms_info:
1846 initialization_url = representation_ms_info['initialization_url']
1847 if not f.get('url'):
1848 f['url'] = initialization_url
1849 f['fragments'].append({'url': initialization_url})
1850 f['fragments'].extend(representation_ms_info['fragments'])
1851 for fragment in f['fragments']:
1852 fragment['url'] = urljoin(base_url, fragment['url'])
1854 existing_format = next(
1855 fo for fo in formats
1856 if fo['format_id'] == representation_id)
1857 except StopIteration:
1858 full_info = formats_dict.get(representation_id, {}).copy()
1860 formats.append(full_info)
1862 existing_format.update(f)
1864 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1867 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1868 res = self._download_webpage_handle(
1870 note=note or 'Downloading ISM manifest',
1871 errnote=errnote or 'Failed to download ISM manifest',
1877 return self._parse_ism_formats(
1878 compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1880 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1881 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1884 duration = int(ism_doc.attrib['Duration'])
1885 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1888 for stream in ism_doc.findall('StreamIndex'):
1889 stream_type = stream.get('Type')
1890 if stream_type not in ('video', 'audio'):
1892 url_pattern = stream.attrib['Url']
1893 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1894 stream_name = stream.get('Name')
1895 for track in stream.findall('QualityLevel'):
1896 fourcc = track.get('FourCC')
1897 # TODO: add support for WVC1 and WMAP
1898 if fourcc not in ('H264', 'AVC1', 'AACL'):
1899 self.report_warning('%s is not a supported codec' % fourcc)
1901 tbr = int(track.attrib['Bitrate']) // 1000
1902 width = int_or_none(track.get('MaxWidth'))
1903 height = int_or_none(track.get('MaxHeight'))
1904 sampling_rate = int_or_none(track.get('SamplingRate'))
1906 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1907 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1913 stream_fragments = stream.findall('c')
1914 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1915 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1916 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1917 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1918 if not fragment_ctx['duration']:
1920 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1922 next_fragment_time = duration
1923 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1924 for _ in range(fragment_repeat):
1926 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1927 'duration': fragment_ctx['duration'] / stream_timescale,
1929 fragment_ctx['time'] += fragment_ctx['duration']
1933 format_id.append(ism_id)
1935 format_id.append(stream_name)
1936 format_id.append(compat_str(tbr))
1939 'format_id': '-'.join(format_id),
1941 'manifest_url': ism_url,
1942 'ext': 'ismv' if stream_type == 'video' else 'isma',
1946 'asr': sampling_rate,
1947 'vcodec': 'none' if stream_type == 'audio' else fourcc,
1948 'acodec': 'none' if stream_type == 'video' else fourcc,
1950 'fragments': fragments,
1951 '_download_params': {
1952 'duration': duration,
1953 'timescale': stream_timescale,
1954 'width': width or 0,
1955 'height': height or 0,
1957 'codec_private_data': track.get('CodecPrivateData'),
1958 'sampling_rate': sampling_rate,
1959 'channels': int_or_none(track.get('Channels', 2)),
1960 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
1961 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
        """Extract entries (formats, thumbnail, subtitles) from HTML5
        <video>/<audio> tags found in webpage.

        NOTE(review): several original lines are elided in this view
        (early returns, media_info initialization, branch conditions);
        comments below describe only the code that is visible here.
        """
        def absolute_url(video_url):
            # Resolve a possibly-relative media URL against the page URL.
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Split a MIME "type/subtype;codecs=..." value into format info
            # (codecs via parse_codecs, extension via mimetype2ext).
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)

        def _media_formats(src, cur_media_type):
            # Turn one src attribute into (is_plain_url, formats); m3u8 and
            # mpd manifests are expanded through the dedicated extractors,
            # in which case is_plain_url is False.
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id)
            # Plain-file branch: audio tags carry no video codec.
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # Self-closing tags first (they have no inner content).
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            # NOTE(review): media_info dict initialization elided in this view.
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            _, formats = _media_formats(src, media_type)
            media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            # <source> children may each carry their own src/type.
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                source_attributes = extract_attributes(source_tag)
                src = source_attributes.get('src')
                is_plain_url, formats = _media_formats(src, media_type)
                # For plain URLs, merge codec info parsed from the type
                # attribute into the single resulting format.
                f = parse_content_type(source_attributes.get('type'))
                f.update(formats[0])
                media_info['formats'].append(f)
                media_info['formats'].extend(formats)
            # <track> children provide subtitles/captions keyed by language.
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if not kind or kind in ('subtitles', 'captions'):
                    src = track_attributes.get('src')
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
            # Only keep entries that produced something usable.
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
2050 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2052 hdcore_sign = 'hdcore=3.7.0'
2053 f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2054 hds_host = hosts.get('hds')
2056 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2057 if 'hdcore=' not in f4m_url:
2058 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2059 f4m_formats = self._extract_f4m_formats(
2060 f4m_url, video_id, f4m_id='hds', fatal=False)
2061 for entry in f4m_formats:
2062 entry.update({'extra_param_to_segment_url': hdcore_sign})
2063 formats.extend(f4m_formats)
2064 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2065 hls_host = hosts.get('hls')
2067 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2068 formats.extend(self._extract_m3u8_formats(
2069 m3u8_url, video_id, 'mp4', 'm3u8_native',
2070 m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe the standard Wowza manifest endpoints (HLS/HDS/DASH/SMIL,
        plus raw rtmp/rtsp) derived from url, skipping protocols listed in
        skip_protocols.

        NOTE(review): several original lines are elided in this view
        (formats initialization, append bodies, return). The mutable
        default skip_protocols=[] is only read here.
        """
        # Strip a trailing manifest filename so per-protocol ones can be
        # appended below.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        # Keep only the scheme-independent tail; a scheme is re-attached
        # per protocol below.
        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
        http_base_url = 'http' + url_base
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                http_base_url + '/playlist.m3u8', video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                http_base_url + '/manifest.f4m',
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                http_base_url + '/manifest.mpd',
                video_id, mpd_id='dash', fatal=False))
        # SMIL-backed streams additionally expose RTMP; derive a matching
        # RTSP variant from each RTMP format.
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    http_base_url + '/jwplayer.smil',
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    # RTSP uses a single URL; RTMP splits url/play_path.
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
        # Bare rtmp/rtsp endpoints as a last resort.
        for protocol in ('rtmp', 'rtsp'):
            if protocol not in skip_protocols:
                'url': protocol + url_base,
                'format_id': protocol,
                'protocol': protocol,
    def _find_jwplayer_data(webpage):
        """Return the raw options blob of a jwplayer('...').setup(...) call
        found in webpage, or fall through if there is none.

        NOTE(review): the re.search call wrapping this pattern and the
        match guard are elided in this view.
        """
        r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
        return mobj.group('options')
2124 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2125 jwplayer_data = self._parse_json(
2126 self._find_jwplayer_data(webpage), video_id,
2127 transform_source=js_to_json)
2128 return self._parse_jwplayer_data(
2129 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer setup/config dict into an info dict (single
        item) or a playlist result.

        NOTE(review): a number of original lines are elided in this view
        (branch conditions, dict literals, formats/subtitles/entries
        initialization); comments describe only the visible code.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            for source in video_data['sources']:
                source_url = self._proto_relative_url(source['file'])
                source_url = compat_urlparse.urljoin(base_url, source_url)
                source_type = source.get('type') or ''
                # Prefer the declared MIME type, fall back to URL extension.
                ext = mimetype2ext(source_type) or determine_ext(source_url)
                if source_type == 'hls' or ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
                    formats.extend(self._extract_mpd_formats(
                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                    height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like 1080p.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,})[pP]$', source.get('label') or '',
                        'height', default=None))
                        'width': int_or_none(source.get('width')),
                    if source_url.startswith('rtmp'):
                        a_format['ext'] = 'flv'
                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                        # of jwplayer.flash.swf
                        rtmp_url_parts = re.split(
                            r'((?:mp4|mp3|flv):)', source_url, 1)
                        if len(rtmp_url_parts) == 3:
                            rtmp_url, prefix, play_path = rtmp_url_parts
                                'play_path': prefix + play_path,
                            a_format.update(rtmp_params)
                    formats.append(a_format)
            self._sort_formats(formats)

            # Caption tracks become subtitles keyed by their label.
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if track.get('kind') != 'captions':
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)

                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,

        # Single entry: return it directly; otherwise wrap as a playlist.
        if len(entries) == 1:
        return self.playlist_result(entries)
2233 def _live_title(self, name):
2234 """ Generate the title for a live video """
2235 now = datetime.datetime.now()
2236 now_str = now.strftime('%Y-%m-%d %H:%M')
2237 return name + ' ' + now_str
2239 def _int(self, v, name, fatal=False, **kwargs):
2240 res = int_or_none(v, **kwargs)
2241 if 'get_attr' in kwargs:
2242 print(getattr(v, kwargs['get_attr']))
2244 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2246 raise ExtractorError(msg)
2248 self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        """Parse v as a float via float_or_none (extra kwargs forwarded).

        NOTE(review): the failure-path control lines (res-is-None check and
        fatal branch) plus the final return are elided in this view; only
        the visible statements are documented.
        """
        res = float_or_none(v, **kwargs)
        # Failure path: either raise or just warn, depending on fatal.
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        raise ExtractorError(msg)
        self._downloader.report_warning(msg)
2261 def _set_cookie(self, domain, name, value, expire_time=None):
2262 cookie = compat_cookiejar.Cookie(
2263 0, name, value, None, None, domain, None,
2264 None, '/', True, False, expire_time, '', None, None, None)
2265 self._downloader.cookiejar.set_cookie(cookie)
2267 def _get_cookies(self, url):
2268 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2269 req = sanitized_Request(url)
2270 self._downloader.cookiejar.add_cookie_header(req)
2271 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        """Yield this extractor's test cases (from _TEST or _TESTS), tagging
        each with the extractor name; only_matching tests are skipped unless
        include_onlymatching is set.

        NOTE(review): a few original lines are elided in this view (the
        _TEST wrapping, the loop header and the yield).
        """
        t = getattr(self, '_TEST', None)
        # Defining both _TEST and _TESTS on one extractor is a programming
        # error, hence the assert.
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = getattr(self, '_TESTS', [])
        if not include_onlymatching and t.get('only_matching', False):
        # Derive the test name from the class name without the 'IE' suffix.
        t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # Judge playlists by their first entry.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            # Any unrestricted test case makes the extractor suitable
            # (NOTE(review): the early-return under this branch is elided
            # in this view).
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
2302 def extract_subtitles(self, *args, **kwargs):
2303 if (self._downloader.params.get('writesubtitles', False) or
2304 self._downloader.params.get('listsubtitles')):
2305 return self._get_subtitles(*args, **kwargs)
2308 def _get_subtitles(self, *args, **kwargs):
2309 raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        # URLs already present in the first list take precedence.
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        # NOTE(review): the final `return ret` is elided in this view.
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        # Shallow copy so neither input dict is mutated.
        ret = dict(subtitle_dict1)
        # NOTE(review): the final `return ret` is elided in this view.
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2328 def extract_automatic_captions(self, *args, **kwargs):
2329 if (self._downloader.params.get('writeautomaticsub', False) or
2330 self._downloader.params.get('listsubtitles')):
2331 return self._get_automatic_captions(*args, **kwargs)
2334 def _get_automatic_captions(self, *args, **kwargs):
2335 raise NotImplementedError('This method must be implemented by subclasses')
2337 def mark_watched(self, *args, **kwargs):
2338 if (self._downloader.params.get('mark_watched', False) and
2339 (self._get_login_info()[0] is not None or
2340 self._downloader.params.get('cookiefile') is not None)):
2341 self._mark_watched(*args, **kwargs)
2343 def _mark_watched(self, *args, **kwargs):
2344 raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        """Headers to use for geo-restriction bypass: sets the configured
        geo verification proxy as Ytdl-request-proxy when present.

        NOTE(review): the headers dict creation and the return line are
        elided in this view.
        """
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
2353 def _generic_id(self, url):
2354 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2356 def _generic_title(self, url):
2357 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
2368 def _make_valid_url(cls):
2369 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2372 def suitable(cls, url):
2373 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        """Dispatch a search query: parse the prefix (empty, 'all' or a
        number n) and delegate to _get_n_results accordingly.

        NOTE(review): several original lines are elided in this view (the
        match guard, the prefix branch headers and the int conversion).
        """
        mobj = re.match(self._make_valid_url(), query)
        # Should not normally happen: suitable() already vetted the URL.
        raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix -> a single result.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        # Numeric prefix: reject non-positive counts...
        raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # ...and clamp oversized requests to the extractor's maximum.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
2395 def _get_n_results(self, query, n):
2396 """Get a specified number of results for a query"""
2397 raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        # Read-only accessor for the extractor's search key (the @property
        # decorator line sits just above this view).
        return self._SEARCH_KEY