1 from __future__ import unicode_literals
15 from ..compat import (
18 compat_etree_fromstring,
24 compat_urllib_parse_unquote,
25 compat_urllib_parse_urlencode,
26 compat_urllib_request,
29 from ..downloader.f4m import remove_encrypted_media
58 parse_m3u8_attributes,
64 class InfoExtractor(object):
65 """Information Extractor class.
67 Information extractors are the classes that, given a URL, extract
68 information about the video (or videos) the URL refers to. This
69 information includes the real video URL, the video title, author and
70 others. The information is stored in a dictionary which is then
71 passed to the YoutubeDL. The YoutubeDL processes this
72 information possibly downloading the video to the file system, among
73 other possible outcomes.
75 The type field determines the type of the result.
76 By far the most common value (and the default if _type is missing) is
77 "video", which indicates a single video.
79 For a video, the dictionaries must include the following fields:
82 title: Video title, unescaped.
84 Additionally, it must contain either a formats entry or a url one:
86 formats: A list of dictionaries for each format available, ordered
87 from worst to best quality.
90 * url Mandatory. The URL of the video file
92 The URL of the manifest file in case of
93 fragmented media (DASH, hls, hds)
94 * ext Will be calculated from URL if missing
95 * format A human-readable description of the format
96 ("mp4 container with h264/opus").
97 Calculated from the format_id, width, height.
98 and format_note fields if missing.
99 * format_id A short description of the format
100 ("mp4_h264_opus" or "19").
101 Technically optional, but strongly recommended.
102 * format_note Additional info about the format
103 ("3D" or "DASH video")
104 * width Width of the video, if known
105 * height Height of the video, if known
106 * resolution Textual description of width and height
107 * tbr Average bitrate of audio and video in KBit/s
108 * abr Average audio bitrate in KBit/s
109 * acodec Name of the audio codec in use
110 * asr Audio sampling rate in Hertz
111 * vbr Average video bitrate in KBit/s
113 * vcodec Name of the video codec in use
114 * container Name of the container format
115 * filesize The number of bytes, if known in advance
116 * filesize_approx An estimate for the number of bytes
117 * player_url SWF Player URL (used for rtmpdump).
118 * protocol The protocol that will be used for the actual
119 download, lower-case.
120 "http", "https", "rtsp", "rtmp", "rtmpe",
121 "m3u8", "m3u8_native" or "http_dash_segments".
122 * fragments A list of fragments of the fragmented media,
123 with the following entries:
124 * "url" (mandatory) - fragment's URL
125 * "duration" (optional, int or float)
126 * "filesize" (optional, int)
127 * preference Order number of this format. If this field is
128 present and not None, the formats get sorted
129 by this field, regardless of all other values.
130 -1 for default (order by other properties),
131 -2 or smaller for less than default.
132 < -1000 to hide the format (if there is
133 another one which is strictly better)
134 * language Language code, e.g. "de" or "en-US".
135 * language_preference Is this in the language mentioned in
137 10 if it's what the URL is about,
138 -1 for default (don't know),
139 -10 otherwise, other values reserved for now.
140 * quality Order number of the video quality of this
141 format, irrespective of the file format.
142 -1 for default (order by other properties),
143 -2 or smaller for less than default.
144 * source_preference Order number for this video source
145 (quality takes higher priority)
146 -1 for default (order by other properties),
147 -2 or smaller for less than default.
148 * http_headers A dictionary of additional HTTP headers
149 to add to the request.
150 * stretched_ratio If given and not 1, indicates that the
151 video's pixels are not square.
152 width : height ratio as float.
153 * no_resume The server does not support resuming the
154 (HTTP or RTMP) download. Boolean.
156 url: Final video URL.
157 ext: Video filename extension.
158 format: The video format, defaults to ext (used for --get-format)
159 player_url: SWF Player URL (used for rtmpdump).
161 The following fields are optional:
163 alt_title: A secondary title of the video.
164 display_id An alternative identifier for the video, not necessarily
165 unique, but available before title. Typically, id is
166 something like "4234987", title "Dancing naked mole rats",
167 and display_id "dancing-naked-mole-rats"
168 thumbnails: A list of dictionaries, with the following entries:
169 * "id" (optional, string) - Thumbnail format ID
171 * "preference" (optional, int) - quality of the image
172 * "width" (optional, int)
173 * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
176 * "filesize" (optional, int)
177 thumbnail: Full URL to a video thumbnail image.
178 description: Full video description.
179 uploader: Full name of the video uploader.
180 license: License name the video is licensed under.
181 creator: The creator of the video.
182 release_date: The date (YYYYMMDD) when the video was released.
183 timestamp: UNIX timestamp of the moment the video became available.
184 upload_date: Video upload date (YYYYMMDD).
185 If not explicitly set, calculated from timestamp.
186 uploader_id: Nickname or id of the video uploader.
187 uploader_url: Full URL to a personal webpage of the video uploader.
188 location: Physical location where the video was filmed.
189 subtitles: The available subtitles as a dictionary in the format
190 {language: subformats}. "subformats" is a list sorted from
191 lower to higher preference, each element is a dictionary
192 with the "ext" entry and one of:
193 * "data": The subtitles file contents
194 * "url": A URL pointing to the subtitles file
195 "ext" will be calculated from URL if missing
196 automatic_captions: Like 'subtitles', used by the YoutubeIE for
197 automatically generated captions
198 duration: Length of the video in seconds, as an integer or float.
199 view_count: How many users have watched the video on the platform.
200 like_count: Number of positive ratings of the video
201 dislike_count: Number of negative ratings of the video
202 repost_count: Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
204 comment_count: Number of comments on the video
205 comments: A list of comments, each with one or more of the following
206 properties (all but one of text or html optional):
207 * "author" - human-readable name of the comment author
208 * "author_id" - user ID of the comment author
210 * "html" - Comment as HTML
211 * "text" - Plain text of the comment
212 * "timestamp" - UNIX timestamp of comment
213 * "parent" - ID of the comment this one is replying to.
214 Set to "root" to indicate that this is a
215 comment to the original video.
216 age_limit: Age restriction for the video, as an integer (years)
217 webpage_url: The URL to the video webpage, if given to youtube-dl it
218 should allow to get the same result again. (It will be set
219 by YoutubeDL if it's missing)
220 categories: A list of categories that the video falls in, for example
222 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
223 is_live: True, False, or None (=unknown). Whether this video is a
224 live stream that goes on instead of a fixed-length video.
225 start_time: Time in seconds where the reproduction should start, as
226 specified in the URL.
227 end_time: Time in seconds where the reproduction should end, as
228 specified in the URL.
230 The following fields should only be used when the video belongs to some logical
233 chapter: Name or title of the chapter the video belongs to.
234 chapter_number: Number of the chapter the video belongs to, as an integer.
235 chapter_id: Id of the chapter the video belongs to, as a unicode string.
237 The following fields should only be used when the video is an episode of some
240 series: Title of the series or programme the video episode belongs to.
241 season: Title of the season the video episode belongs to.
242 season_number: Number of the season the video episode belongs to, as an integer.
243 season_id: Id of the season the video episode belongs to, as a unicode string.
244 episode: Title of the video episode. Unlike mandatory video title field,
245 this field should denote the exact title of the video episode
246 without any kind of decoration.
247 episode_number: Number of the video episode within a season, as an integer.
248 episode_id: Id of the video episode, as a unicode string.
250 The following fields should only be used when the media is a track or a part of
253 track: Title of the track.
254 track_number: Number of the track within an album or a disc, as an integer.
255 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
257 artist: Artist(s) of the track.
258 genre: Genre(s) of the track.
259 album: Title of the album the track belongs to.
260 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
261 album_artist: List of all artists appeared on the album (e.g.
262 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
264 disc_number: Number of the disc or other physical medium the track belongs to,
266 release_year: Year (YYYY) when the album was released.
268 Unless mentioned otherwise, the fields should be Unicode strings.
270 Unless mentioned otherwise, None is equivalent to absence of information.
273 _type "playlist" indicates multiple videos.
274 There must be a key "entries", which is a list, an iterable, or a PagedList
275 object, each element of which is a valid dictionary by this specification.
277 Additionally, playlists can have "title", "description" and "id" attributes
278 with the same semantics as videos (see above).
281 _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
283 It must have an entries key like a playlist and contain all the keys
284 required for a video at the same time.
287 _type "url" indicates that the video must be extracted from another
288 location, possibly by a different extractor. Its only required key is:
289 "url" - the next URL to extract.
290 The key "ie_key" can be set to the class name (minus the trailing "IE",
291 e.g. "Youtube") if the extractor class is known in advance.
292 Additionally, the dictionary may have any properties of the resolved entity
293 known in advance, for example "title" if the title of the referred video is
297 _type "url_transparent" entities have the same specification as "url", but
298 indicate that the given additional information is more precise than the one
299 associated with the resolved URL.
300 This is useful when a site employs a video service that hosts the video and
301 its technical metadata, but that video service does not embed a useful
302 title, description etc.
305 Subclasses of this one should re-define the _real_initialize() and
306 _real_extract() methods and define a _VALID_URL regexp.
307 Probably, they should also be added to the list of extractors.
309 Finally, the _WORKING attribute should be set to False for broken IEs
310 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        downloader: the object this extractor delegates network and output
        calls to (urlopen, to_screen, report_warning); may be None and
        attached later via set_downloader().
        """
        self.set_downloader(downloader)
323 def suitable(cls, url):
324 """Receives a URL and returns True if suitable for this IE."""
326 # This does not use has/getattr intentionally - we want to know whether
327 # we have cached the regexp for *this* class, whereas getattr would also
328 # match the superclass
329 if '_VALID_URL_RE' not in cls.__dict__:
330 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
331 return cls._VALID_URL_RE.match(url) is not None
334 def _match_id(cls, url):
335 if '_VALID_URL_RE' not in cls.__dict__:
336 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
337 m = cls._VALID_URL_RE.match(url)
343 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Delegates the actual work to _real_initialize(), which subclasses
        # override.
        self._real_initialize()
352 def extract(self, url):
353 """Extracts URL information and returns it in list of dicts."""
356 return self._real_extract(url)
357 except ExtractorError:
359 except compat_http_client.IncompleteRead as e:
360 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
361 except (KeyError, StopIteration) as e:
362 raise ExtractorError('An extractor error has occurred.', cause=e)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # All network access and screen output of this extractor goes
        # through this object (urlopen, to_screen, report_warning, params).
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        Intentionally a no-op stub in the base class.
        """
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Must return an info dict (or playlist/url result) as described in
        the class docstring.
        """
378 """A string for getting the InfoExtractor with get_info_extractor"""
379 return compat_str(cls.__name__[:-2])
383 return compat_str(type(self).__name__[:-2])
385 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
386 """ Returns the response handle """
388 self.report_download_webpage(video_id)
389 elif note is not False:
391 self.to_screen('%s' % (note,))
393 self.to_screen('%s: %s' % (video_id, note))
394 if isinstance(url_or_request, compat_urllib_request.Request):
395 url_or_request = update_Request(
396 url_or_request, data=data, headers=headers, query=query)
399 url_or_request = update_url_query(url_or_request, query)
400 if data is not None or headers:
401 url_or_request = sanitized_Request(url_or_request, data, headers)
403 return self._downloader.urlopen(url_or_request)
404 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
408 errnote = 'Unable to download webpage'
410 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
412 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
414 self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
        """ Returns a tuple (page content as string, URL handle) """
        # NOTE(review): the {} defaults for headers/query are shared between
        # calls; they are only forwarded here, never mutated.
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]
        # Fetch the response handle, then decode its body into a string;
        # note/errnote/fatal are forwarded unchanged to both helpers.
        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
431 def _guess_encoding_from_content(content_type, webpage_bytes):
432 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
434 encoding = m.group(1)
436 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
437 webpage_bytes[:1024])
439 encoding = m.group(1).decode('ascii')
440 elif webpage_bytes.startswith(b'\xff\xfe'):
447 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
448 content_type = urlh.headers.get('Content-Type', '')
449 webpage_bytes = urlh.read()
450 if prefix is not None:
451 webpage_bytes = prefix + webpage_bytes
453 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
454 if self._downloader.params.get('dump_intermediate_pages', False):
456 url = url_or_request.get_full_url()
457 except AttributeError:
459 self.to_screen('Dumping request to ' + url)
460 dump = base64.b64encode(webpage_bytes).decode('ascii')
461 self._downloader.to_screen(dump)
462 if self._downloader.params.get('write_pages', False):
464 url = url_or_request.get_full_url()
465 except AttributeError:
467 basen = '%s_%s' % (video_id, url)
469 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
470 basen = basen[:240 - len(h)] + h
471 raw_filename = basen + '.dump'
472 filename = sanitize_filename(raw_filename, restricted=True)
473 self.to_screen('Saving request to ' + filename)
474 # Working around MAX_PATH limitation on Windows (see
475 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
476 if compat_os_name == 'nt':
477 absfilepath = os.path.abspath(filename)
478 if len(absfilepath) > 259:
479 filename = '\\\\?\\' + absfilepath
480 with open(filename, 'wb') as outf:
481 outf.write(webpage_bytes)
484 content = webpage_bytes.decode(encoding, 'replace')
486 content = webpage_bytes.decode('utf-8', 'replace')
488 if ('<title>Access to this site is blocked</title>' in content and
489 'Websense' in content[:512]):
490 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
491 blocked_iframe = self._html_search_regex(
492 r'<iframe src="([^"]+)"', content,
493 'Websense information URL', default=None)
495 msg += ' Visit %s for more details' % blocked_iframe
496 raise ExtractorError(msg, expected=True)
497 if '<title>The URL you requested has been blocked</title>' in content[:512]:
499 'Access to this webpage has been blocked by Indian censorship. '
500 'Use a VPN or proxy server (with --proxy) to route around it.')
501 block_msg = self._html_search_regex(
502 r'</h1><p>(.*?)</p>',
503 content, 'block message', default=None)
505 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
506 raise ExtractorError(msg, expected=True)
510 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
511 """ Returns the data of the page as a string """
514 while success is False:
516 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
518 except compat_http_client.IncompleteRead as e:
520 if try_count >= tries:
522 self._sleep(timeout, video_id)
529 def _download_xml(self, url_or_request, video_id,
530 note='Downloading XML', errnote='Unable to download XML',
531 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
532 """Return the xml as an xml.etree.ElementTree.Element"""
533 xml_string = self._download_webpage(
534 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
535 if xml_string is False:
538 xml_string = transform_source(xml_string)
539 return compat_etree_fromstring(xml_string.encode('utf-8'))
541 def _download_json(self, url_or_request, video_id,
542 note='Downloading JSON metadata',
543 errnote='Unable to download JSON metadata',
544 transform_source=None,
545 fatal=True, encoding=None, data=None, headers={}, query={}):
546 json_string = self._download_webpage(
547 url_or_request, video_id, note, errnote, fatal=fatal,
548 encoding=encoding, data=data, headers=headers, query=query)
549 if (not fatal) and json_string is False:
551 return self._parse_json(
552 json_string, video_id, transform_source=transform_source, fatal=fatal)
554 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
556 json_string = transform_source(json_string)
558 return json.loads(json_string)
559 except ValueError as ve:
560 errmsg = '%s: Failed to parse JSON ' % video_id
562 raise ExtractorError(errmsg, cause=ve)
564 self.report_warning(errmsg + str(ve))
566 def report_warning(self, msg, video_id=None):
567 idstr = '' if video_id is None else '%s: ' % video_id
568 self._downloader.report_warning(
569 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
571 def to_screen(self, msg):
572 """Print msg to screen, prefixing it with '[ie_name]'"""
573 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
    def report_extraction(self, id_or_name):
        """Report information extraction."""
        # Shown as "<id>: Extracting information" with the IE-name prefix
        # added by to_screen().
        self.to_screen('%s: Extracting information' % id_or_name)
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Shown as "<video_id>: Downloading webpage".
        self.to_screen('%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        # Plain status line; no video id is involved at this point.
        self.to_screen('Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        # Plain status line emitted before credential-based authentication.
        self.to_screen('Logging in')
592 def raise_login_required(msg='This video is only available for registered users'):
593 raise ExtractorError(
594 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
598 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
599 raise ExtractorError(
600 '%s. You might want to use --proxy to workaround.' % msg,
603 # Methods for following #608
605 def url_result(url, ie=None, video_id=None, video_title=None):
606 """Returns a URL that points to a page that should be processed"""
607 # TODO: ie should be the class used for getting the info
608 video_info = {'_type': 'url',
611 if video_id is not None:
612 video_info['id'] = video_id
613 if video_title is not None:
614 video_info['title'] = video_title
618 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
619 """Returns a playlist"""
620 video_info = {'_type': 'playlist',
623 video_info['id'] = playlist_id
625 video_info['title'] = playlist_title
626 if playlist_description:
627 video_info['description'] = playlist_description
630 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
632 Perform a regex search on the given string, using a single or a list of
633 patterns returning the first matching group.
634 In case of failure return a default value or raise a WARNING or a
635 RegexNotFoundError, depending on fatal, specifying the field name.
637 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
638 mobj = re.search(pattern, string, flags)
641 mobj = re.search(p, string, flags)
645 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
646 _name = '\033[0;34m%s\033[0m' % name
652 # return the first matching group
653 return next(g for g in mobj.groups() if g is not None)
655 return mobj.group(group)
656 elif default is not NO_DEFAULT:
659 raise RegexNotFoundError('Unable to extract %s' % _name)
661 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
664 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
666 Like _search_regex, but strips HTML tags and unescapes entities.
668 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
670 return clean_html(res).strip()
674 def _get_netrc_login_info(self, netrc_machine=None):
677 netrc_machine = netrc_machine or self._NETRC_MACHINE
679 if self._downloader.params.get('usenetrc', False):
681 info = netrc.netrc().authenticators(netrc_machine)
686 raise netrc.NetrcParseError(
687 'No authenticators for %s' % netrc_machine)
688 except (IOError, netrc.NetrcParseError) as err:
689 self._downloader.report_warning(
690 'parsing .netrc: %s' % error_to_compat_str(err))
692 return username, password
694 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
696 Get the login info as (username, password)
697 First look for the manually specified credentials using username_option
698 and password_option as keys in params dictionary. If no such credentials
699 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
701 If there's no info available, return (None, None)
703 if self._downloader is None:
706 downloader_params = self._downloader.params
708 # Attempt to use provided username and password or .netrc data
709 if downloader_params.get(username_option) is not None:
710 username = downloader_params[username_option]
711 password = downloader_params[password_option]
713 username, password = self._get_netrc_login_info(netrc_machine)
715 return username, password
717 def _get_tfa_info(self, note='two-factor verification code'):
719 Get the two-factor authentication info
720 TODO - asking the user will be required for sms/phone verify
721 currently just uses the command line option
722 If there's no info available, return None
724 if self._downloader is None:
726 downloader_params = self._downloader.params
728 if downloader_params.get('twofactor') is not None:
729 return downloader_params['twofactor']
731 return compat_getpass('Type %s and press [Return]: ' % note)
733 # Helper functions for extracting OpenGraph info
735 def _og_regexes(prop):
736 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
737 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
738 % {'prop': re.escape(prop)})
739 template = r'<meta[^>]+?%s[^>]+?%s'
741 template % (property_re, content_re),
742 template % (content_re, property_re),
    def _meta_regex(prop):
        # Regex matching a <meta> tag whose itemprop/name/property/id/
        # http-equiv attribute equals *prop* (quoted or bare, via the
        # lookahead) and capturing its content attribute in the named
        # group "content". prop is escaped, so it is matched literally.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
751 def _og_search_property(self, prop, html, name=None, **kargs):
752 if not isinstance(prop, (list, tuple)):
755 name = 'OpenGraph %s' % prop[0]
758 og_regexes.extend(self._og_regexes(p))
759 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
762 return unescapeHTML(escaped)
    def _og_search_thumbnail(self, html, **kargs):
        # og:image; non-fatal because many pages simply lack a thumbnail.
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
    def _og_search_description(self, html, **kargs):
        # og:description; non-fatal, a missing description is acceptable.
        return self._og_search_property('description', html, fatal=False, **kargs)
    def _og_search_title(self, html, **kargs):
        # og:title; unlike the description/thumbnail helpers no fatal=False
        # is passed, so failure behavior follows _og_search_property's default.
        return self._og_search_property('title', html, **kargs)
773 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
774 regexes = self._og_regexes('video') + self._og_regexes('video:url')
776 regexes = self._og_regexes('video:secure_url') + regexes
777 return self._html_search_regex(regexes, html, name, **kargs)
    def _og_search_url(self, html, **kargs):
        # og:url — the page's canonical URL as declared in its OpenGraph data.
        return self._og_search_property('url', html, **kargs)
782 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
783 if not isinstance(name, (list, tuple)):
785 if display_name is None:
786 display_name = name[0]
787 return self._html_search_regex(
788 [self._meta_regex(n) for n in name],
789 html, display_name, fatal=fatal, group='content', **kwargs)
    def _dc_search_uploader(self, html):
        # Dublin Core metadata: <meta name="dc.creator"> names the author.
        return self._html_search_meta('dc.creator', html, 'uploader')
794 def _rta_search(self, html):
795 # See http://www.rtalabel.org/index.php?content=howtofaq#single
796 if re.search(r'(?ix)<meta\s+name="rating"\s+'
797 r' content="RTA-5042-1996-1400-1577-RTA"',
802 def _media_rating_search(self, html):
803 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
804 rating = self._html_search_meta('rating', html)
816 return RATING_TABLE.get(rating.lower())
818 def _family_friendly_search(self, html):
819 # See http://schema.org/VideoObject
820 family_friendly = self._html_search_meta('isFamilyFriendly', html)
822 if not family_friendly:
831 return RATING_TABLE.get(family_friendly.lower())
    def _twitter_search_player(self, html):
        # Twitter card player URL, from <meta name="twitter:player">.
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
837 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
838 json_ld = self._search_regex(
839 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
840 html, 'JSON-LD', group='json_ld', **kwargs)
841 default = kwargs.get('default', NO_DEFAULT)
843 return default if default is not NO_DEFAULT else {}
844 # JSON-LD may be malformed and thus `fatal` should be respected.
845 # At the same time `default` may be passed that assumes `fatal=False`
846 # for _search_regex. Let's simulate the same behavior here as well.
847 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
848 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
850 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
851 if isinstance(json_ld, compat_str):
852 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
856 if not isinstance(json_ld, (list, tuple, dict)):
858 if isinstance(json_ld, dict):
861 if e.get('@context') == 'http://schema.org':
862 item_type = e.get('@type')
863 if expected_type is not None and expected_type != item_type:
865 if item_type == 'TVEpisode':
867 'episode': unescapeHTML(e.get('name')),
868 'episode_number': int_or_none(e.get('episodeNumber')),
869 'description': unescapeHTML(e.get('description')),
871 part_of_season = e.get('partOfSeason')
872 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
873 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
874 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
875 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
876 info['series'] = unescapeHTML(part_of_series.get('name'))
877 elif item_type == 'Article':
879 'timestamp': parse_iso8601(e.get('datePublished')),
880 'title': unescapeHTML(e.get('headline')),
881 'description': unescapeHTML(e.get('articleBody')),
883 elif item_type == 'VideoObject':
885 'url': e.get('contentUrl'),
886 'title': unescapeHTML(e.get('name')),
887 'description': unescapeHTML(e.get('description')),
888 'thumbnail': e.get('thumbnailUrl'),
889 'duration': parse_duration(e.get('duration')),
890 'timestamp': unified_timestamp(e.get('uploadDate')),
891 'filesize': float_or_none(e.get('contentSize')),
892 'tbr': int_or_none(e.get('bitrate')),
893 'width': int_or_none(e.get('width')),
894 'height': int_or_none(e.get('height')),
897 return dict((k, v) for k, v in info.items() if v is not None)
900 def _hidden_inputs(html):
901 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
903 for input in re.findall(r'(?i)(<input[^>]+>)', html):
904 attrs = extract_attributes(input)
907 if attrs.get('type') not in ('hidden', 'submit'):
909 name = attrs.get('name') or attrs.get('id')
910 value = attrs.get('value')
911 if name and value is not None:
912 hidden_inputs[name] = value
915 def _form_hidden_inputs(self, form_id, html):
916 form = self._search_regex(
917 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
918 html, '%s form' % form_id, group='form')
919 return self._hidden_inputs(form)
    def _sort_formats(self, formats, field_preference=None):
        """Sort *formats* in place from worst to best quality.

        *field_preference* may be a list/tuple of format-dict keys that
        replaces the built-in heuristic sort key.
        NOTE(review): several original lines are elided in this extract;
        the visible code is reproduced unchanged.
        """
        raise ExtractorError('No video formats found')

        # Automatically determine tbr when missing based on abr and vbr (improves
        # formats sorting in some cases)
        if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
            f['tbr'] = f['abr'] + f['vbr']

        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        # Caller-supplied sort fields take precedence over the heuristic key.
        if isinstance(field_preference, (list, tuple)):
            if f.get(field) is not None
            else ('' if field == 'format_id' else -1)
            for field in field_preference)

        preference = f.get('preference')
        if preference is None:
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

        protocol = f.get('protocol') or determine_protocol(f)
        proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            audio_ext_preference = ORDER.index(f['ext'])
            audio_ext_preference = -1
        if f.get('acodec') == 'none':  # video only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            ORDER = ['webm', 'flv', 'mp4']
            ext_preference = ORDER.index(f['ext'])
            audio_ext_preference = 0

        # Components of the heuristic sort key (most significant first);
        # missing values map to -1 (or '' for format_id).
        f.get('language_preference') if f.get('language_preference') is not None else -1,
        f.get('quality') if f.get('quality') is not None else -1,
        f.get('tbr') if f.get('tbr') is not None else -1,
        f.get('filesize') if f.get('filesize') is not None else -1,
        f.get('vbr') if f.get('vbr') is not None else -1,
        f.get('height') if f.get('height') is not None else -1,
        f.get('width') if f.get('width') is not None else -1,
        f.get('abr') if f.get('abr') is not None else -1,
        audio_ext_preference,
        f.get('fps') if f.get('fps') is not None else -1,
        f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
        f.get('source_preference') if f.get('source_preference') is not None else -1,
        f.get('format_id') if f.get('format_id') is not None else '',
        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        # Drop formats whose URL fails a validity probe (see _is_valid_url).
        # NOTE(review): surrounding lines of this filter call are elided in
        # this extract; reproduced as-is.
        lambda f: self._is_valid_url(
            item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
    def _remove_duplicate_formats(formats):
        # De-duplicate formats by URL, keeping the first occurrence and
        # preserving order; the list is updated in place via slice assignment.
        # NOTE(review): the setup lines (format_urls/unique_formats init and
        # the loop header) are elided in this extract.
        if f['url'] not in format_urls:
            format_urls.add(f['url'])
            unique_formats.append(f)
        formats[:] = unique_formats
    def _is_valid_url(self, url, video_id, item='video'):
        # Probe *url* with a request; an ExtractorError caused by a URLError
        # marks the URL invalid.  NOTE(review): several lines (early returns,
        # the try: header and the to_screen call) are elided in this extract.
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
        self._request_webpage(url, video_id, 'Checking %s URL' % item)
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                '%s: %s URL is invalid, skipping' % (video_id, item))
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # Chooses 'http:' when the prefer_insecure option is set.
        # NOTE(review): the enclosing return expression is elided here.
        if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative URLs ('//host/path') against *scheme*,
        # defaulting to http_scheme() when no scheme was supplied.
        # NOTE(review): guard and return lines are elided in this extract.
        if url.startswith('//'):
            scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        """Announce and perform a wait of *timeout* seconds.

        *msg_template* may reference ``%(video_id)s`` and ``%(timeout)s``.
        NOTE(review): the lines that print *msg* and actually sleep are
        elided in this extract.
        """
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None):
        """Download an Adobe HDS (f4m) manifest and parse it into formats.

        NOTE(review): the closing arguments of the download call and the
        early return under the False check are elided in this extract.
        """
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,

        # _download_xml returns False when the download failed non-fatally.
        if manifest is False:

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an Adobe HDS (f4m) manifest document into a formats list.

        NOTE(review): a number of original lines are elided in this extract;
        the visible code is reproduced unchanged.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':

        # Media nodes live in a version-specific namespace; try 1.0 then 2.0.
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                    if media_url is None:
                        media_url = media_el.attrib.get('url')
                # Resolve relative media URLs against the manifest location.
                media_url if media_url.startswith('http://') or media_url.startswith('https://')
                else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                f4m_formats = self._extract_f4m_formats(
                    manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    'tbr': f.get('tbr') or tbr,
                    'width': f.get('width') or width,
                    'height': f.get('height') or height,
                    'format_id': f.get('format_id') if not tbr else format_id,
                formats.extend(f4m_formats)
                formats.extend(self._extract_m3u8_formats(
                    manifest_url, video_id, 'mp4', preference=preference,
                    m3u8_id=m3u8_id, fatal=fatal))
            'format_id': format_id,
            'url': manifest_url,
            'manifest_url': manifest_url,
            # HDS fragments are FLV when a bootstrap is present.
            'ext': 'flv' if bootstrap_info is not None else None,
            'preference': preference,
    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
        # Synthesize a low-priority "meta" format pointing at the master m3u8
        # URL itself (used for quality selection).  NOTE(review): the
        # enclosing dict-literal lines are elided in this extract.
        'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
        # A preference of 0 (falsy) also maps to -100 here.
        'preference': preference - 100 if preference else -100,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
        """Download an HLS (m3u8) playlist and parse it into formats.

        NOTE(review): a number of original lines are elided in this extract;
        the visible code is reproduced unchanged.
        """
        res = self._download_webpage_handle(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
        m3u8_doc, urlh = res
        # Use the final URL after redirects as the base for relative URIs.
        m3u8_url = urlh.geturl()

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            'format_id': m3u8_id,
            'protocol': entry_protocol,
            'preference': preference,
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                media = parse_m3u8_attributes(line)
                media_type = media.get('TYPE')
                if media_type in ('VIDEO', 'AUDIO'):
                    media_url = media.get('URI')
                    for v in (media.get('GROUP-ID'), media.get('NAME')):
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'language': media.get('LANGUAGE'),
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                    'protocol': entry_protocol,
                    'preference': preference,
                    # When there is no URI in EXT-X-MEDIA let this tag's
                    # data be used by regular URI lines below
            elif line.startswith('#') or not line.strip():
                # Prefer AVERAGE-BANDWIDTH over peak BANDWIDTH (kbit/s).
                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                format_id.append(m3u8_id)
                # Despite specification does not mention NAME attribute for
                # EXT-X-STREAM-INF it still sometimes may be present
                stream_name = last_info.get('NAME') or last_media.get('NAME')
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                'format_id': '-'.join(format_id),
                'url': manifest_url,
                'manifest_url': manifest_url,
                'fps': float_or_none(last_info.get('FRAME-RATE')),
                'protocol': entry_protocol,
                'preference': preference,
                resolution = last_info.get('RESOLUTION')
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
                # Unified Streaming Platform
                r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                abr, vbr = mobj.groups()
                abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                f.update(parse_codecs(last_info.get('CODECS')))
    def _xpath_ns(path, namespace=None):
        # Qualify each component of XPath *path* with *namespace* ('{ns}tag'),
        # leaving empty and '.' components untouched.
        # NOTE(review): guard/setup lines and the continue/else branch are
        # elided in this extract.
        for c in path.split('/'):
            if not c or c == '.':
            out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        # Download a SMIL document and parse its formats.
        # NOTE(review): the failed-download guard between these calls is
        # elided in this extract.
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and parse it into a full info dict.
        # NOTE(review): the failed-download guard is elided in this extract.
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1322 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1323 return self._download_xml(
1324 smil_url, video_id, 'Downloading SMIL file',
1325 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Parse a SMIL document into a full info dict (formats, subtitles,
        metadata and thumbnails).

        NOTE(review): several original lines are elided in this extract.
        """
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # Fall back to the SMIL file's basename as the video id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        # Collect title/description/upload date from <head><meta> entries.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        # Thumbnail list built from <image> nodes that carry a src.
        'id': image.get('type'),
        'url': image.get('src'),
        'width': int_or_none(image.get('width')),
        'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'subtitles': subtitles,
1367 def _parse_smil_namespace(self, smil):
1368 return self._search_regex(
1369 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Parse <video>/<audio> nodes of a SMIL document into formats
        (RTMP, HLS, HDS and plain HTTP variants).

        NOTE(review): a number of original lines are elided in this extract;
        the visible code is reproduced unchanged.
        """
        # A <meta base=...>/<meta httpBase=...> overrides the default base URL.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            # Skip missing sources and duplicates.
            if not src or src in srcs:
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'filesize': filesize,
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()
            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single HLS format without metadata inherits this node's.
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)
            if src_ext == 'f4m':
                'plugin': 'flowplayer-3.2.0.1',
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'filesize': filesize,
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Collect subtitle entries from SMIL <textstream> nodes, keyed by
        language (defaulting to *subtitles_lang*).

        NOTE(review): setup and return lines are elided in this extract.
        """
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            # Skip missing sources and duplicates.
            if not src or src in urls:
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
1481 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1482 xspf = self._download_xml(
1483 playlist_url, playlist_id, 'Downloading xpsf playlist',
1484 'Unable to download xspf manifest', fatal=fatal)
1487 return self._parse_xspf(xspf, playlist_id)
    def _parse_xspf(self, playlist, playlist_id):
        """Parse an XSPF playlist document into a list of entry dicts.

        NOTE(review): several original lines are elided in this extract.
        """
        # 's1' is the StreamOne extension namespace carrying format metadata.
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
            # One format per <location>, with StreamOne label/size attributes.
            'url': location.text,
            'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
            'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
            'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        # Download a DASH MPD manifest and parse it into formats.
        # NOTE(review): mutable default argument formats_dict={} — it is only
        # passed through here, but a None default would be safer.  Some lines
        # (positional url arguments, failure guard, tuple unpacking) are
        # elided in this extract.
        res = self._download_webpage_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
        # Base for relative URLs: everything up to the last '/' of the final
        # (post-redirect) manifest URL.
        mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()

        return self._parse_mpd_formats(
            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP

        NOTE(review): a number of original lines are elided in this extract;
        the visible code is reproduced unchanged.  Also note the mutable
        default argument formats_dict={} (read-only here, but fragile).
        """
        # Live ("dynamic") manifests are not handled.
        if mpd_doc.get('type') == 'dynamic':

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

            # (_add_ns helper; its def line is elided in this extract.)
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    ms_info['total_number'] = 0
                    # @r repeats the segment r extra times.
                    r = int(s.get('r', 0))
                    ms_info['total_number'] += 1 + r
                    ms_info['s'].append({
                        't': int(s.get('t', 0)),
                        # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                        'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = int(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media_template = segment_template.get('media')
                ms_info['media_template'] = media_template
                initialization = segment_template.get('initialization')
                ms_info['initialization_url'] = initialization
                extract_Initialization(segment_template)

        def combine_url(base_url, target_url):
            # Join *target_url* onto *base_url* unless it is already absolute.
            if re.match(r'^https?://', target_url):
            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # DRM-protected renditions are skipped entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes inherit from the AdaptationSet.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type == 'video' or content_type == 'audio':
                        # Build the effective base URL from nested <BaseURL>
                        # elements, innermost first, stopping at an absolute URL.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                        'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                            media_template = representation_ms_info['media_template']
                            media_template = media_template.replace('$RepresentationID$', representation_id)
                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
                            media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
                            # NOTE(review): str.replace returns a new string and the
                            # result is discarded here, so '$$' is never unescaped to
                            # '$'; this probably should read
                            #   media_template = media_template.replace('$$', '$')
                            media_template.replace('$$', '$')

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                # NOTE(review): the bare string 'segment_duration' is
                                # always truthy; this looks like it was meant to be
                                # "'segment_duration' in representation_ms_info".
                                if 'total_number' not in representation_ms_info and 'segment_duration':
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    'url': media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': representation_attrib.get('bandwidth'),
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': representation_attrib.get('bandwidth'),
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    # @r extra repetitions of this segment.
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            for segment_url in representation_ms_info['segment_urls']:
                                s = representation_ms_info['s'][s_num]
                                for r in range(s.get('r', 0) + 1):
                                    'duration': float_or_none(s['d'], representation_ms_info['timescale']),
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                            'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({'url': initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            for fragment in f['fragments']:
                                fragment['url'] = combine_url(base_url, fragment['url'])
                        # Merge with a pre-existing format of the same id
                        # instead of appending a duplicate.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == representation_id)
                        except StopIteration:
                            full_info = formats_dict.get(representation_id, {}).copy()
                            formats.append(full_info)
                            existing_format.update(f)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
        """Scan *webpage* for HTML5 <video>/<audio> tags and build one media
        info dict (formats + subtitles + thumbnail) per tag.

        NOTE(review): a number of original lines are elided in this extract;
        the visible code is reproduced unchanged.
        """
        def absolute_url(video_url):
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Split 'mime/type; codecs="..."' into ext/codec format fields.
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)

        def _media_formats(src, cur_media_type):
            # Returns (is_plain_url, formats) for a single source URL; m3u8
            # sources are expanded into their variant formats.
            full_url = absolute_url(src)
            if determine_ext(full_url) == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
                'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # Self-closing tags have no content; paired tags keep their body so
        # that nested <source>/<track> elements can be parsed below.
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            _, formats = _media_formats(src, media_type)
            media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                source_attributes = extract_attributes(source_tag)
                src = source_attributes.get('src')
                is_plain_url, formats = _media_formats(src, media_type)
                # Plain URLs get the type attribute's codec info merged in.
                f = parse_content_type(source_attributes.get('type'))
                f.update(formats[0])
                media_info['formats'].append(f)
                media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if not kind or kind in ('subtitles', 'captions'):
                    src = track_attributes.get('src')
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id):
        """Derive both HDS and HLS formats from an Akamai manifest URL by
        rewriting between the /z/...manifest.f4m and /i/...master.m3u8 forms.

        NOTE(review): the formats-list init and return lines are elided in
        this extract.
        """
        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        # The hdcore parameter must also be carried on every segment URL.
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe the standard Wowza endpoints (HLS, HDS, DASH, SMIL/RTMP,
        RTSP) derived from a single stream URL.

        NOTE(review): mutable default argument skip_protocols=[] (read-only
        here).  Several original lines are elided in this extract.
        """
        # Strip any explicit manifest suffix to obtain the stream base URL.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
        http_base_url = 'http' + url_base
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                http_base_url + '/playlist.m3u8', video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                http_base_url + '/manifest.f4m',
                video_id, f4m_id='hds', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'dash' not in skip_protocols:
                formats.extend(self._extract_mpd_formats(
                    http_base_url + '/manifest.mpd',
                    video_id, mpd_id='dash', fatal=False))
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    http_base_url + '/jwplayer.smil',
                    video_id, fatal=False)
                # Each RTMP format gets a mirrored RTSP variant.
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
        for protocol in ('rtmp', 'rtsp'):
            if protocol not in skip_protocols:
                'url': protocol + url_base,
                'format_id': protocol,
                'protocol': protocol,
1908 def _live_title(self, name):
1909 """ Generate the title for a live video """
1910 now = datetime.datetime.now()
1911 now_str = now.strftime('%Y-%m-%d %H:%M')
1912 return name + ' ' + now_str
1914 def _int(self, v, name, fatal=False, **kwargs):
1915 res = int_or_none(v, **kwargs)
1916 if 'get_attr' in kwargs:
1917 print(getattr(v, kwargs['get_attr']))
1919 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1921 raise ExtractorError(msg)
1923 self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Parse *v* as a float via float_or_none; on failure either raise
        # ExtractorError (fatal=True) or emit a downloader warning.
        # NOTE(review): the None-check, fatal branch and return lines are
        # elided in this extract.
        res = float_or_none(v, **kwargs)
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        raise ExtractorError(msg)
        self._downloader.report_warning(msg)
    def _set_cookie(self, domain, name, value, expire_time=None):
        """Store a cookie in the downloader's cookiejar.

        domain/name/value: the cookie's domain and key/value pair.
        expire_time: optional expiry (Unix timestamp); None makes it a
        session cookie.
        """
        # Cookie() only takes positional fields; in order they are:
        # version=0, name, value, port=None, port_specified=None, domain,
        # domain_specified=None, domain_initial_dot=None, path='/',
        # path_specified=True, secure=False, expires=expire_time,
        # discard='', comment=None, comment_url=None, rest=None.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1942 def _get_cookies(self, url):
1943 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1944 req = sanitized_Request(url)
1945 self._downloader.cookiejar.add_cookie_header(req)
1946 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1948 def get_testcases(self, include_onlymatching=False):
1949 t = getattr(self, '_TEST', None)
1951 assert not hasattr(self, '_TESTS'), \
1952 '%s has _TEST and _TESTS' % type(self).__name__
1955 tests = getattr(self, '_TESTS', [])
1957 if not include_onlymatching and t.get('only_matching', False):
1959 t['name'] = type(self).__name__[:-len('IE')]
1962 def is_suitable(self, age_limit):
1963 """ Test whether the extractor is generally suitable for the given
1964 age limit (i.e. pornographic sites are not, all others usually are) """
1966 any_restricted = False
1967 for tc in self.get_testcases(include_onlymatching=False):
1968 if tc.get('playlist', []):
1969 tc = tc['playlist'][0]
1970 is_restricted = age_restricted(
1971 tc.get('info_dict', {}).get('age_limit'), age_limit)
1972 if not is_restricted:
1974 any_restricted = any_restricted or is_restricted
1975 return not any_restricted
1977 def extract_subtitles(self, *args, **kwargs):
1978 if (self._downloader.params.get('writesubtitles', False) or
1979 self._downloader.params.get('listsubtitles')):
1980 return self._get_subtitles(*args, **kwargs)
1983 def _get_subtitles(self, *args, **kwargs):
1984 raise NotImplementedError('This method must be implemented by subclasses')
1987 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1988 """ Merge subtitle items for one language. Items with duplicated URLs
1989 will be dropped. """
1990 list1_urls = set([item['url'] for item in subtitle_list1])
1991 ret = list(subtitle_list1)
1992 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1996 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1997 """ Merge two subtitle dictionaries, language by language. """
1998 ret = dict(subtitle_dict1)
1999 for lang in subtitle_dict2:
2000 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2003 def extract_automatic_captions(self, *args, **kwargs):
2004 if (self._downloader.params.get('writeautomaticsub', False) or
2005 self._downloader.params.get('listsubtitles')):
2006 return self._get_automatic_captions(*args, **kwargs)
2009 def _get_automatic_captions(self, *args, **kwargs):
2010 raise NotImplementedError('This method must be implemented by subclasses')
2012 def mark_watched(self, *args, **kwargs):
2013 if (self._downloader.params.get('mark_watched', False) and
2014 (self._get_login_info()[0] is not None or
2015 self._downloader.params.get('cookiefile') is not None)):
2016 self._mark_watched(*args, **kwargs)
2018 def _mark_watched(self, *args, **kwargs):
2019 raise NotImplementedError('This method must be implemented by subclasses')
2021 def geo_verification_headers(self):
2023 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2024 if geo_verification_proxy:
2025 headers['Ytdl-request-proxy'] = geo_verification_proxy
2028 def _generic_id(self, url):
2029 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2031 def _generic_title(self, url):
2032 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (single best result), a positive number, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Dispatch a search URL to _get_n_results with the requested count."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # bare search key: return only the first result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                # defensive: the URL regex should already exclude this
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # clamp oversized requests to the extractor's maximum
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY