1 from __future__ import unicode_literals
15 from ..compat import (
18 compat_etree_fromstring,
54 class InfoExtractor(object):
55 """Information Extractor class.
57 Information extractors are the classes that, given a URL, extract
58 information about the video (or videos) the URL refers to. This
59 information includes the real video URL, the video title, author and
60 others. The information is stored in a dictionary which is then
61 passed to the YoutubeDL. The YoutubeDL processes this
62 information possibly downloading the video to the file system, among
63 other possible outcomes.
65 The type field determines the type of the result.
66 By far the most common value (and the default if _type is missing) is
67 "video", which indicates a single video.
69 For a video, the dictionaries must include the following fields:
72 title: Video title, unescaped.
74 Additionally, it must contain either a formats entry or a url one:
76 formats: A list of dictionaries for each format available, ordered
77 from worst to best quality.
80 * url Mandatory. The URL of the video file
81 * ext Will be calculated from URL if missing
82 * format A human-readable description of the format
83 ("mp4 container with h264/opus").
84 Calculated from the format_id, width, height,
85 and format_note fields if missing.
86 * format_id A short description of the format
87 ("mp4_h264_opus" or "19").
88 Technically optional, but strongly recommended.
89 * format_note Additional info about the format
90 ("3D" or "DASH video")
91 * width Width of the video, if known
92 * height Height of the video, if known
93 * resolution Textual description of width and height
94 * tbr Average bitrate of audio and video in KBit/s
95 * abr Average audio bitrate in KBit/s
96 * acodec Name of the audio codec in use
97 * asr Audio sampling rate in Hertz
98 * vbr Average video bitrate in KBit/s
100 * vcodec Name of the video codec in use
101 * container Name of the container format
102 * filesize The number of bytes, if known in advance
103 * filesize_approx An estimate for the number of bytes
104 * player_url SWF Player URL (used for rtmpdump).
105 * protocol The protocol that will be used for the actual
106 download, lower-case.
107 "http", "https", "rtsp", "rtmp", "rtmpe",
108 "m3u8", or "m3u8_native".
109 * preference Order number of this format. If this field is
110 present and not None, the formats get sorted
111 by this field, regardless of all other values.
112 -1 for default (order by other properties),
113 -2 or smaller for less than default.
114 < -1000 to hide the format (if there is
115 another one which is strictly better)
116 * language Language code, e.g. "de" or "en-US".
117 * language_preference Is this in the language mentioned in
119 10 if it's what the URL is about,
120 -1 for default (don't know),
121 -10 otherwise, other values reserved for now.
122 * quality Order number of the video quality of this
123 format, irrespective of the file format.
124 -1 for default (order by other properties),
125 -2 or smaller for less than default.
126 * source_preference Order number for this video source
127 (quality takes higher priority)
128 -1 for default (order by other properties),
129 -2 or smaller for less than default.
130 * http_headers A dictionary of additional HTTP headers
131 to add to the request.
132 * stretched_ratio If given and not 1, indicates that the
133 video's pixels are not square.
134 width : height ratio as float.
135 * no_resume The server does not support resuming the
136 (HTTP or RTMP) download. Boolean.
138 url: Final video URL.
139 ext: Video filename extension.
140 format: The video format, defaults to ext (used for --get-format)
141 player_url: SWF Player URL (used for rtmpdump).
143 The following fields are optional:
145 alt_title: A secondary title of the video.
146 display_id An alternative identifier for the video, not necessarily
147 unique, but available before title. Typically, id is
148 something like "4234987", title "Dancing naked mole rats",
149 and display_id "dancing-naked-mole-rats"
150 thumbnails: A list of dictionaries, with the following entries:
151 * "id" (optional, string) - Thumbnail format ID
153 * "preference" (optional, int) - quality of the image
154 * "width" (optional, int)
155 * "height" (optional, int)
156 * "resolution" (optional, string "{width}x{height}",
158 thumbnail: Full URL to a video thumbnail image.
159 description: Full video description.
160 uploader: Full name of the video uploader.
161 license: License name the video is licensed under.
162 creator: The main artist who created the video.
163 release_date: The date (YYYYMMDD) when the video was released.
164 timestamp: UNIX timestamp of the moment the video became available.
165 upload_date: Video upload date (YYYYMMDD).
166 If not explicitly set, calculated from timestamp.
167 uploader_id: Nickname or id of the video uploader.
168 uploader_url: Full URL to a personal webpage of the video uploader.
169 location: Physical location where the video was filmed.
170 subtitles: The available subtitles as a dictionary in the format
171 {language: subformats}. "subformats" is a list sorted from
172 lower to higher preference, each element is a dictionary
173 with the "ext" entry and one of:
174 * "data": The subtitles file contents
175 * "url": A URL pointing to the subtitles file
176 "ext" will be calculated from URL if missing
177 automatic_captions: Like 'subtitles', used by the YoutubeIE for
178 automatically generated captions
179 duration: Length of the video in seconds, as an integer or float.
180 view_count: How many users have watched the video on the platform.
181 like_count: Number of positive ratings of the video
182 dislike_count: Number of negative ratings of the video
183 repost_count: Number of reposts of the video
184 average_rating: Average rating given by users, the scale used depends on the webpage
185 comment_count: Number of comments on the video
186 comments: A list of comments, each with one or more of the following
187 properties (all but one of text or html optional):
188 * "author" - human-readable name of the comment author
189 * "author_id" - user ID of the comment author
191 * "html" - Comment as HTML
192 * "text" - Plain text of the comment
193 * "timestamp" - UNIX timestamp of comment
194 * "parent" - ID of the comment this one is replying to.
195 Set to "root" to indicate that this is a
196 comment to the original video.
197 age_limit: Age restriction for the video, as an integer (years)
198 webpage_url: The URL to the video webpage, if given to youtube-dl it
199 should allow to get the same result again. (It will be set
200 by YoutubeDL if it's missing)
201 categories: A list of categories that the video falls in, for example
203 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
204 is_live: True, False, or None (=unknown). Whether this video is a
205 live stream that goes on instead of a fixed-length video.
206 start_time: Time in seconds where the reproduction should start, as
207 specified in the URL.
208 end_time: Time in seconds where the reproduction should end, as
209 specified in the URL.
211 The following fields should only be used when the video belongs to some logical
214 chapter: Name or title of the chapter the video belongs to.
215 chapter_number: Number of the chapter the video belongs to, as an integer.
216 chapter_id: Id of the chapter the video belongs to, as a unicode string.
218 The following fields should only be used when the video is an episode of some
221 series: Title of the series or programme the video episode belongs to.
222 season: Title of the season the video episode belongs to.
223 season_number: Number of the season the video episode belongs to, as an integer.
224 season_id: Id of the season the video episode belongs to, as a unicode string.
225 episode: Title of the video episode. Unlike mandatory video title field,
226 this field should denote the exact title of the video episode
227 without any kind of decoration.
228 episode_number: Number of the video episode within a season, as an integer.
229 episode_id: Id of the video episode, as a unicode string.
231 Unless mentioned otherwise, the fields should be Unicode strings.
233 Unless mentioned otherwise, None is equivalent to absence of information.
236 _type "playlist" indicates multiple videos.
237 There must be a key "entries", which is a list, an iterable, or a PagedList
238 object, each element of which is a valid dictionary by this specification.
240 Additionally, playlists can have "title", "description" and "id" attributes
241 with the same semantics as videos (see above).
244 _type "multi_video" indicates that there are multiple videos that
245 form a single show, for example multiple acts of an opera or TV episode.
246 It must have an entries key like a playlist and contain all the keys
247 required for a video at the same time.
250 _type "url" indicates that the video must be extracted from another
251 location, possibly by a different extractor. Its only required key is:
252 "url" - the next URL to extract.
253 The key "ie_key" can be set to the class name (minus the trailing "IE",
254 e.g. "Youtube") if the extractor class is known in advance.
255 Additionally, the dictionary may have any properties of the resolved entity
256 known in advance, for example "title" if the title of the referred video is
260 _type "url_transparent" entities have the same specification as "url", but
261 indicate that the given additional information is more precise than the one
262 associated with the resolved URL.
263 This is useful when a site employs a video service that hosts the video and
264 its technical metadata, but that video service does not embed a useful
265 title, description etc.
268 Subclasses of this one should re-define the _real_initialize() and
269 _real_extract() methods and define a _VALID_URL regexp.
270 Probably, they should also be added to the list of extractors.
272 Finally, the _WORKING attribute should be set to False for broken IEs
273 in order to warn the users and skip the tests.
# NOTE(review): this excerpt is garbled — indentation is stripped, original
# line numbers are embedded in each line, and the jump in the embedded
# numbering (281 -> 283) shows at least one statement is missing here.
# Restore from upstream before editing the code itself.
280 def __init__(self, downloader=None):
281 """Constructor. Receives an optional downloader."""
# The downloader is stored via set_downloader() below.
283 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Look the compiled pattern up on *this* class only (cls.__dict__
    # rather than getattr), so each subclass caches its own compiled
    # regexp instead of silently reusing a superclass's cache.
    cached = cls.__dict__.get('_VALID_URL_RE')
    if cached is None:
        cached = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cached.match(url) is not None
# NOTE(review): garbled excerpt — indentation stripped, embedded original
# line numbers, and interior lines missing (embedded numbering jumps, e.g.
# 300 -> 306, 316 -> 319). The fragments below cover _match_id, the
# _WORKING getter, initialize() and extract(); restore from upstream
# before editing.
297 def _match_id(cls, url):
298 if '_VALID_URL_RE' not in cls.__dict__:
299 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
300 m = cls._VALID_URL_RE.match(url)
# The lines returning the matched id group are missing from this excerpt —
# TODO confirm against upstream.
306 """Getter method for _WORKING."""
309 def initialize(self):
310 """Initializes an instance (authentication, etc)."""
312 self._real_initialize()
315 def extract(self, url):
316 """Extracts URL information and returns it in list of dicts."""
# extract() wraps _real_extract in error translation: network errors and
# extractor-internal KeyError/StopIteration become ExtractorError.
319 return self._real_extract(url)
320 except ExtractorError:
322 except compat_http_client.IncompleteRead as e:
323 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
324 except (KeyError, StopIteration) as e:
325 raise ExtractorError('An extractor error has occurred.', cause=e)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Kept for later use by the _download_*/report_* helpers; may be None.
    self._downloader = downloader
# NOTE(review): garbled excerpt — the bodies of these subclass hooks and
# the ie_key()/IE_NAME definitions are partially missing (embedded
# numbering jumps). Restore from upstream before editing.
331 def _real_initialize(self):
332 """Real initialization process. Redefine in subclasses."""
335 def _real_extract(self, url):
336 """Real extraction process. Redefine in subclasses."""
# ie_key(): class name minus the trailing "IE" suffix.
341 """A string for getting the InfoExtractor with get_info_extractor"""
342 return compat_str(cls.__name__[:-2])
# IE_NAME: instance-level variant of the same derivation.
346 return compat_str(type(self).__name__[:-2])
# NOTE(review): garbled excerpt covering the page-download helper family
# (_request_webpage, _download_webpage_handle, _guess_encoding_from_content,
# _webpage_read_content, _download_webpage, _download_xml, _download_json,
# _parse_json). Indentation is stripped, original line numbers are embedded,
# and many interior lines (try/except scaffolding, returns, blank lines)
# are missing — the embedded numbering jumps repeatedly. Restore from
# upstream before editing the code itself; the comments below only
# annotate the visible fragments.
348 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
349 """ Returns the response handle """
# note=None -> default "Downloading webpage" message; note=False suppresses it.
351 self.report_download_webpage(video_id)
352 elif note is not False:
354 self.to_screen('%s' % (note,))
356 self.to_screen('%s: %s' % (video_id, note))
358 return self._downloader.urlopen(url_or_request)
# On network failure: raise when fatal, otherwise warn and (presumably)
# return False — the return line is missing here; TODO confirm upstream.
359 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
363 errnote = 'Unable to download webpage'
365 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
367 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
369 self._downloader.report_warning(errmsg)
372 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
373 """ Returns a tuple (page content as string, URL handle) """
374 # Strip hashes from the URL (#1038)
375 if isinstance(url_or_request, (compat_str, str)):
376 url_or_request = url_or_request.partition('#')[0]
378 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
382 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
383 return (content, urlh)
# Encoding detection order: Content-Type charset, then an in-document
# <meta charset>, then a BOM sniff.
386 def _guess_encoding_from_content(content_type, webpage_bytes):
387 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
389 encoding = m.group(1)
391 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
392 webpage_bytes[:1024])
394 encoding = m.group(1).decode('ascii')
395 elif webpage_bytes.startswith(b'\xff\xfe'):
402 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
403 content_type = urlh.headers.get('Content-Type', '')
404 webpage_bytes = urlh.read()
405 if prefix is not None:
406 webpage_bytes = prefix + webpage_bytes
408 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
# --dump-pages: base64-dump the raw response to the screen.
409 if self._downloader.params.get('dump_intermediate_pages', False):
411 url = url_or_request.get_full_url()
412 except AttributeError:
414 self.to_screen('Dumping request to ' + url)
415 dump = base64.b64encode(webpage_bytes).decode('ascii')
416 self._downloader.to_screen(dump)
# --write-pages: save the raw response to a sanitized dump file.
417 if self._downloader.params.get('write_pages', False):
419 url = url_or_request.get_full_url()
420 except AttributeError:
422 basen = '%s_%s' % (video_id, url)
424 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
425 basen = basen[:240 - len(h)] + h
426 raw_filename = basen + '.dump'
427 filename = sanitize_filename(raw_filename, restricted=True)
428 self.to_screen('Saving request to ' + filename)
429 # Working around MAX_PATH limitation on Windows (see
430 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
431 if compat_os_name == 'nt':
432 absfilepath = os.path.abspath(filename)
433 if len(absfilepath) > 259:
434 filename = '\\\\?\\' + absfilepath
435 with open(filename, 'wb') as outf:
436 outf.write(webpage_bytes)
438 content = webpage_bytes.decode(encoding, 'replace')
441 content = webpage_bytes.decode('utf-8', 'replace')
# Detect well-known block pages (Websense, Indian censorship) and raise
# an expected ExtractorError with a helpful message.
443 if ('<title>Access to this site is blocked</title>' in content and
444 'Websense' in content[:512]):
445 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
446 blocked_iframe = self._html_search_regex(
447 r'<iframe src="([^"]+)"', content,
448 'Websense information URL', default=None)
450 msg += ' Visit %s for more details' % blocked_iframe
451 raise ExtractorError(msg, expected=True)
452 if '<title>The URL you requested has been blocked</title>' in content[:512]:
454 'Access to this webpage has been blocked by Indian censorship. '
455 'Use a VPN or proxy server (with --proxy) to route around it.')
456 block_msg = self._html_search_regex(
457 r'</h1><p>(.*?)</p>',
458 content, 'block message', default=None)
460 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
461 raise ExtractorError(msg, expected=True)
# _download_webpage retries IncompleteRead up to `tries` times, sleeping
# `timeout` seconds between attempts.
465 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
466 """ Returns the data of the page as a string """
469 while success is False:
471 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
473 except compat_http_client.IncompleteRead as e:
475 if try_count >= tries:
477 self._sleep(timeout, video_id)
484 def _download_xml(self, url_or_request, video_id,
485 note='Downloading XML', errnote='Unable to download XML',
486 transform_source=None, fatal=True, encoding=None):
487 """Return the xml as an xml.etree.ElementTree.Element"""
488 xml_string = self._download_webpage(
489 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
490 if xml_string is False:
493 xml_string = transform_source(xml_string)
494 return compat_etree_fromstring(xml_string.encode('utf-8'))
496 def _download_json(self, url_or_request, video_id,
497 note='Downloading JSON metadata',
498 errnote='Unable to download JSON metadata',
499 transform_source=None,
500 fatal=True, encoding=None):
501 json_string = self._download_webpage(
502 url_or_request, video_id, note, errnote, fatal=fatal,
504 if (not fatal) and json_string is False:
506 return self._parse_json(
507 json_string, video_id, transform_source=transform_source, fatal=fatal)
# _parse_json: ValueError from json.loads is re-raised as ExtractorError
# when fatal, otherwise only warned about.
509 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
511 json_string = transform_source(json_string)
513 return json.loads(json_string)
514 except ValueError as ve:
515 errmsg = '%s: Failed to parse JSON ' % video_id
517 raise ExtractorError(errmsg, cause=ve)
519 self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with the IE name."""
    # Optional "<video_id>: " prefix, mirroring the other report_* helpers.
    prefix = '' if video_id is None else '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, prefix, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    tagged = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen('Confirming age')
def report_login(self):
    """Report attempt to log in."""
    self.to_screen('Logging in')
# NOTE(review): garbled excerpt — the closing keyword arguments of both
# raise statements (presumably expected=True) are missing; the embedded
# numbering jumps 549 -> 553 and ends at 555. Restore from upstream.
547 def raise_login_required(msg='This video is only available for registered users'):
548 raise ExtractorError(
549 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
553 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
554 raise ExtractorError(
555 '%s. You might want to use --proxy to workaround.' % msg,
558 # Methods for following #608
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a URL that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    # id/title are only included when the caller actually knows them.
    if video_id is not None:
        video_info['id'] = video_id
    if video_title is not None:
        video_info['title'] = video_title
    return video_info
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist"""
    video_info = {
        '_type': 'playlist',
        'entries': entries,
    }
    # Optional metadata is added only when truthy (note: truthiness, not
    # an explicit None check, matching the original behavior).
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    if playlist_description:
        video_info['description'] = playlist_description
    return video_info
# NOTE(review): garbled excerpt covering _search_regex, _html_search_regex,
# _get_login_info and _get_tfa_info. Indentation is stripped and many
# interior lines (docstring quotes, else branches, returns) are missing —
# the embedded numbering jumps. Restore from upstream before editing.
585 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
587 Perform a regex search on the given string, using a single or a list of
588 patterns returning the first matching group.
589 In case of failure return a default value or raise a WARNING or a
590 RegexNotFoundError, depending on fatal, specifying the field name.
# A single pattern (string or compiled) is searched directly; otherwise
# the list of patterns is tried in order.
592 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
593 mobj = re.search(pattern, string, flags)
596 mobj = re.search(p, string, flags)
# Colorize the field name in error output on capable terminals.
600 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
601 _name = '\033[0;34m%s\033[0m' % name
607 # return the first matching group
608 return next(g for g in mobj.groups() if g is not None)
610 return mobj.group(group)
611 elif default is not NO_DEFAULT:
614 raise RegexNotFoundError('Unable to extract %s' % _name)
616 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
619 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
621 Like _search_regex, but strips HTML tags and unescapes entities.
623 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
625 return clean_html(res).strip()
629 def _get_login_info(self):
631 Get the login info as (username, password)
632 It will look in the netrc file using the _NETRC_MACHINE value
633 If there's no info available, return (None, None)
635 if self._downloader is None:
640 downloader_params = self._downloader.params
642 # Attempt to use provided username and password or .netrc data
643 if downloader_params.get('username') is not None:
644 username = downloader_params['username']
645 password = downloader_params['password']
646 elif downloader_params.get('usenetrc', False):
648 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
653 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# .netrc problems are downgraded to a warning rather than aborting.
654 except (IOError, netrc.NetrcParseError) as err:
655 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
657 return (username, password)
659 def _get_tfa_info(self, note='two-factor verification code'):
661 Get the two-factor authentication info
662 TODO - asking the user will be required for sms/phone verify
663 currently just uses the command line option
664 If there's no info available, return None
666 if self._downloader is None:
668 downloader_params = self._downloader.params
670 if downloader_params.get('twofactor') is not None:
671 return downloader_params['twofactor']
673 return compat_getpass('Type %s and press [Return]: ' % note)
675 # Helper functions for extracting OpenGraph info
677 def _og_regexes(prop):
678 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
679 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
680 % {'prop': re.escape(prop)})
681 template = r'<meta[^>]+?%s[^>]+?%s'
683 template % (property_re, content_re),
684 template % (content_re, property_re),
688 def _meta_regex(prop):
689 return r'''(?isx)<meta
690 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
691 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
# NOTE(review): garbled excerpt — the default-name guard and the
# escaped-is-None early return appear to be missing (embedded numbering
# jumps 693 -> 695 and 696 -> 699). Restore from upstream before editing.
693 def _og_search_property(self, prop, html, name=None, **kargs):
695 name = 'OpenGraph %s' % prop
696 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
# HTML entities in the matched value are unescaped before returning.
699 return unescapeHTML(escaped)
701 def _og_search_thumbnail(self, html, **kargs):
702 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
704 def _og_search_description(self, html, **kargs):
705 return self._og_search_property('description', html, fatal=False, **kargs)
707 def _og_search_title(self, html, **kargs):
708 return self._og_search_property('title', html, **kargs)
# NOTE(review): garbled excerpt — the `if secure:` guard before line 713
# appears to be missing (embedded numbering jumps 711 -> 713). Restore
# from upstream before editing.
710 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
711 regexes = self._og_regexes('video') + self._og_regexes('video:url')
# og:video:secure_url is preferred (prepended) — presumably only when
# `secure` is truthy; TODO confirm upstream.
713 regexes = self._og_regexes('video:secure_url') + regexes
714 return self._html_search_regex(regexes, html, name, **kargs)
716 def _og_search_url(self, html, **kargs):
717 return self._og_search_property('url', html, **kargs)
# NOTE(review): garbled excerpt — the line assigning a default
# display_name (presumably `display_name = name`) is missing between
# embedded lines 720 and 722. Restore from upstream before editing.
719 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
720 if display_name is None:
722 return self._html_search_regex(
723 self._meta_regex(name),
724 html, display_name, fatal=fatal, group='content', **kwargs)
726 def _dc_search_uploader(self, html):
727 return self._html_search_meta('dc.creator', html, 'uploader')
# NOTE(review): garbled excerpt covering _rta_search,
# _media_rating_search and _family_friendly_search. The RATING_TABLE
# literals and several return statements are missing (embedded numbering
# jumps 732 -> 737, 739 -> 751, 757 -> 766). Restore from upstream.
729 def _rta_search(self, html):
730 # See http://www.rtalabel.org/index.php?content=howtofaq#single
731 if re.search(r'(?ix)<meta\s+name="rating"\s+'
732 r' content="RTA-5042-1996-1400-1577-RTA"',
737 def _media_rating_search(self, html):
738 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
739 rating = self._html_search_meta('rating', html)
# RATING_TABLE (not visible here) maps rating strings to age limits.
751 return RATING_TABLE.get(rating.lower())
753 def _family_friendly_search(self, html):
754 # See http://schema.org/VideoObject
755 family_friendly = self._html_search_meta('isFamilyFriendly', html)
757 if not family_friendly:
766 return RATING_TABLE.get(family_friendly.lower())
768 def _twitter_search_player(self, html):
769 return self._html_search_meta('twitter:player', html,
770 'twitter card player')
# NOTE(review): garbled excerpt covering _search_json_ld, _json_ld and
# _hidden_inputs. Indentation is stripped and interior lines (default
# handling, dict literal openers, `continue` statements, the final
# return of _hidden_inputs) are missing — the embedded numbering jumps.
# Restore from upstream before editing.
772 def _search_json_ld(self, html, video_id, **kwargs):
773 json_ld = self._search_regex(
774 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
775 html, 'JSON-LD', group='json_ld', **kwargs)
778 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
780 def _json_ld(self, json_ld, video_id, fatal=True):
781 if isinstance(json_ld, compat_str):
782 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
# Only schema.org-context documents are interpreted; TVEpisode and
# Article @types get dedicated field mappings.
786 if json_ld.get('@context') == 'http://schema.org':
787 item_type = json_ld.get('@type')
788 if item_type == 'TVEpisode':
790 'episode': unescapeHTML(json_ld.get('name')),
791 'episode_number': int_or_none(json_ld.get('episodeNumber')),
792 'description': unescapeHTML(json_ld.get('description')),
794 part_of_season = json_ld.get('partOfSeason')
795 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
796 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
797 part_of_series = json_ld.get('partOfSeries')
798 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
799 info['series'] = unescapeHTML(part_of_series.get('name'))
800 elif item_type == 'Article':
802 'timestamp': parse_iso8601(json_ld.get('datePublished')),
803 'title': unescapeHTML(json_ld.get('headline')),
804 'description': unescapeHTML(json_ld.get('articleBody')),
# Drop keys whose extracted value is None.
806 return dict((k, v) for k, v in info.items() if v is not None)
809 def _hidden_inputs(html):
# HTML comments are removed first so commented-out inputs are ignored.
810 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
812 for input in re.findall(r'(?i)<input([^>]+)>', html):
813 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
815 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
818 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
821 hidden_inputs[name.group('value')] = value.group('value')
824 def _form_hidden_inputs(self, form_id, html):
825 form = self._search_regex(
826 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
827 html, '%s form' % form_id, group='form')
828 return self._hidden_inputs(form)
# NOTE(review): garbled excerpt covering _sort_formats and the start of
# _check_formats. Indentation is stripped and many interior lines (the
# _formats_key def line, try/except around ORDER.index, parts of the key
# tuple, protocol/extension preference lines) are missing — the embedded
# numbering jumps repeatedly. Restore from upstream before editing.
830 def _sort_formats(self, formats, field_preference=None):
832 raise ExtractorError('No video formats found')
835 # Automatically determine tbr when missing based on abr and vbr (improves
836 # formats sorting in some cases)
837 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
838 f['tbr'] = f['abr'] + f['vbr']
841 # TODO remove the following workaround
842 from ..utils import determine_ext
843 if not f.get('ext') and 'url' in f:
844 f['ext'] = determine_ext(f['url'])
# A caller-supplied field_preference short-circuits the default ranking.
846 if isinstance(field_preference, (list, tuple)):
847 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
849 preference = f.get('preference')
850 if preference is None:
852 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
855 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
857 if f.get('vcodec') == 'none': # audio only
858 if self._downloader.params.get('prefer_free_formats'):
859 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
861 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
864 audio_ext_preference = ORDER.index(f['ext'])
866 audio_ext_preference = -1
868 if self._downloader.params.get('prefer_free_formats'):
869 ORDER = ['flv', 'mp4', 'webm']
871 ORDER = ['webm', 'flv', 'mp4']
873 ext_preference = ORDER.index(f['ext'])
876 audio_ext_preference = 0
# Key tuple: missing numeric fields are replaced by -1 so None never
# participates in comparisons.
880 f.get('language_preference') if f.get('language_preference') is not None else -1,
881 f.get('quality') if f.get('quality') is not None else -1,
882 f.get('tbr') if f.get('tbr') is not None else -1,
883 f.get('filesize') if f.get('filesize') is not None else -1,
884 f.get('vbr') if f.get('vbr') is not None else -1,
885 f.get('height') if f.get('height') is not None else -1,
886 f.get('width') if f.get('width') is not None else -1,
889 f.get('abr') if f.get('abr') is not None else -1,
890 audio_ext_preference,
891 f.get('fps') if f.get('fps') is not None else -1,
892 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
893 f.get('source_preference') if f.get('source_preference') is not None else -1,
894 f.get('format_id') if f.get('format_id') is not None else '',
# In-place sort from worst to best quality.
896 formats.sort(key=_formats_key)
898 def _check_formats(self, formats, video_id):
901 lambda f: self._is_valid_url(
903 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
907 def _remove_duplicate_formats(formats):
911 if f['url'] not in format_urls:
912 format_urls.add(f['url'])
913 unique_formats.append(f)
914 formats[:] = unique_formats
# NOTE(review): garbled excerpt covering _is_valid_url, http_scheme,
# _proto_relative_url, _sleep and _extract_f4m_formats. Indentation is
# stripped and many interior lines (returns, try blocks, format-dict
# openers) are missing — the embedded numbering jumps repeatedly.
# Restore from upstream before editing.
916 def _is_valid_url(self, url, video_id, item='video'):
917 url = self._proto_relative_url(url, scheme='http:')
918 # For now assume non HTTP(S) URLs always valid
919 if not (url.startswith('http://') or url.startswith('https://')):
922 self._request_webpage(url, video_id, 'Checking %s URL' % item)
924 except ExtractorError as e:
925 if isinstance(e.cause, compat_urllib_error.URLError):
927 '%s: %s URL is invalid, skipping' % (video_id, item))
931 def http_scheme(self):
932 """ Either "http:" or "https:", depending on the user's preferences """
933 if self._downloader.params.get('prefer_insecure', False)
938 def _proto_relative_url(self, url, scheme=None):
# Protocol-relative URLs ("//host/...") get the preferred scheme prepended.
941 if url.startswith('//'):
943 scheme = self.http_scheme()
948 def _sleep(self, timeout, video_id, msg_template=None):
949 if msg_template is None:
950 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
951 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
955 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
956 transform_source=lambda s: fix_xml_ampersands(s).strip(),
958 manifest = self._download_xml(
959 manifest_url, video_id, 'Downloading f4m manifest',
960 'Unable to download f4m manifest',
961 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
962 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
963 transform_source=transform_source,
966 if manifest is False:
# F4M 1.0 and 2.0 use different XML namespaces for <media> nodes.
970 manifest_version = '1.0'
971 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
973 manifest_version = '2.0'
974 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
975 base_url = xpath_text(
976 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
977 'base URL', default=None)
979 base_url = base_url.strip()
980 for i, media_el in enumerate(media_nodes):
981 if manifest_version == '2.0':
982 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
# Relative media URLs are resolved against baseURL or the manifest's
# own directory (assignment target line is missing from this excerpt).
986 media_url if media_url.startswith('http://') or media_url.startswith('https://')
987 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
988 # If media_url is itself a f4m manifest do the recursive extraction
989 # since bitrates in parent manifest (this one) and media_url manifest
990 # may differ leading to inability to resolve the format by requested
991 # bitrate in f4m downloader
992 if determine_ext(manifest_url) == 'f4m':
993 formats.extend(self._extract_f4m_formats(
994 manifest_url, video_id, preference, f4m_id, fatal=fatal))
996 tbr = int_or_none(media_el.attrib.get('bitrate'))
998 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1002 'width': int_or_none(media_el.attrib.get('width')),
1003 'height': int_or_none(media_el.attrib.get('height')),
1004 'preference': preference,
1006 self._sort_formats(formats)
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
        # Parse an HLS (m3u8) playlist into youtube-dl format dicts.
        # A '<id>-meta' pseudo-format always points at the playlist itself so
        # quality selection can be deferred to the downloader.
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            # NOTE(review): preference == 0 falls through to -1 here (falsy
            # check, not `is None`) — confirm 0 is never a meaningful value.
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        # Resolve playlist-relative URIs against the (possibly redirected)
        # playlist URL; absolute http(s) URIs pass through untouched.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        res = self._download_webpage_handle(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
        m3u8_doc, urlh = res
        # Use the final URL after redirects as the base for relative URIs.
        m3u8_url = urlh.geturl()
        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            'format_id': m3u8_id,
            'protocol': entry_protocol,
            'preference': preference,
        # ATTR=value pairs; quoted values may contain commas.
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Attributes of the variant described by the next URI line.
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                # Rendition metadata (audio/subtitle groups etc.).
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags and blank lines carry no format info here.
                if last_info is None:
                    # Bare URI with no preceding STREAM-INF: keep URL only.
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id.append(m3u8_id)
                # Prefer the rendition NAME over tbr/index, except for
                # subtitle renditions.
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                codecs = last_info.get('CODECS')
                    vcodec, acodec = [None] * 2
                    va_codecs = codecs.split(',')
                    if len(va_codecs) == 1:
                        # Audio only entries usually come with single codec and
                        # no resolution. For more robustness we also check it to
                        if not resolution and va_codecs[0].startswith('mp4a'):
                            vcodec, acodec = 'none', va_codecs[0]
                            vcodec = va_codecs[0]
                        vcodec, acodec = va_codecs[:2]
                if last_media is not None:
                    f['m3u8_media'] = last_media
        self._sort_formats(formats)
    def _xpath_ns(path, namespace=None):
        # Qualify each step of an XPath with the given XML namespace,
        # leaving '' and '.' steps untouched.
        for c in path.split('/'):
            if not c or c == '.':
            out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and return only its formats
        # (no metadata/subtitles, unlike _extract_smil_info).
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        namespace = self._parse_smil_namespace(smil)
        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and parse it into a full info dict
        # (formats plus title/description/thumbnails/subtitles).
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1161 def _download_smil(self, smil_url, video_id, fatal=True):
1162 return self._download_xml(
1163 smil_url, video_id, 'Downloading SMIL file',
1164 'Unable to download SMIL file', fatal=fatal)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        # Parse a SMIL document into a complete info dict: formats,
        # subtitles, and <head> metadata (title/description/date).
        namespace = self._parse_smil_namespace(smil)
        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
        # Fall back to the SMIL filename (sans extension) as the id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        # Pull title/description/upload date out of <head><meta> entries;
        # first non-empty value of each kind wins.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)
            # Thumbnails come from <image> nodes that carry a src.
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
1206 def _parse_smil_namespace(self, smil):
1207 return self._search_regex(
1208 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        # Turn each <video> node of a SMIL document into format dicts,
        # dispatching on protocol/extension: RTMP, HLS (m3u8), HDS (f4m),
        # or plain HTTP progressive.
        # <head><meta base=...> provides the base URL for relative srcs.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            # Skip empty and already-seen sources.
            if not src or src in srcs:
            # systemBitrate is in bit/s; convert to KBit/s.
            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'filesize': filesize,
                # Allow the caller to rewrite streamer/play-path pairs
                # (some CDNs need it).
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()
            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry HLS result is a media playlist: fold the
                # SMIL-level bitrate/size info into it.
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)
            if src_ext == 'f4m':
                # Default f4m query parameters expected by Flowplayer-style
                # servers; merged with caller-supplied f4m_params.
                'plugin': 'flowplayer-3.2.0.1',
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            # Plain progressive download; validate the URL first.
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'filesize': filesize,
        self._sort_formats(formats)
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        # Collect subtitle tracks from <textstream> nodes, grouped by
        # language; duplicate URLs are skipped.
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
            # Derive the extension from the node, the URL, or the MIME type.
            ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
            # Several attribute spellings are seen in the wild; fall back to
            # the caller-supplied default language.
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
1322 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1323 xspf = self._download_xml(
1324 playlist_url, playlist_id, 'Downloading xpsf playlist',
1325 'Unable to download xspf manifest', fatal=fatal)
1328 return self._parse_xspf(xspf, playlist_id)
    def _parse_xspf(self, playlist, playlist_id):
        # Parse an XSPF playlist document; each <track> becomes one entry
        # whose formats come from its <location> children.
        # 's1' holds StreamOne's vendor extensions (label/width/height).
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds; convert to seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        # Download a DASH MPD manifest and delegate to _parse_mpd_formats.
        # NOTE(review): formats_dict={} is a mutable default argument — safe
        # only while nothing mutates it; verify before extending.
        res = self._download_webpage_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
        # Base URL = the final (post-redirect) manifest URL up to the last '/'.
        mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
        return self._parse_mpd_formats(
            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
        # Parse a DASH MPD ElementTree into format dicts.
        # Live ('dynamic') manifests are not supported.
        # NOTE(review): formats_dict={} is a mutable default argument.
        if mpd_doc.get('type') == 'dynamic':
        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
            return self._xpath_ns(path, namespace)
        def is_drm_protected(element):
            # A ContentProtection child marks the element as DRM'd.
            return element.find(_add_ns('ContentProtection')) is not None
        def extract_multisegment_info(element, ms_parent_info):
            # Merge this element's SegmentList/SegmentTemplate data over the
            # parent's (Period -> AdaptationSet -> Representation inherit).
            ms_info = ms_parent_info.copy()
            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                initialization = segment_list.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    start_number = segment_template.get('startNumber')
                        ms_info['start_number'] = int(start_number)
                    segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
                    if segment_timeline is not None:
                        s_e = segment_timeline.findall(_add_ns('S'))
                            ms_info['total_number'] = 0
                                # r is the repeat count: each <S> stands for
                                # 1 + r segments.
                                ms_info['total_number'] += 1 + int(s.get('r', '0'))
                    timescale = segment_template.get('timescale')
                        ms_info['timescale'] = int(timescale)
                    segment_duration = segment_template.get('duration')
                    if segment_duration:
                        ms_info['segment_duration'] = int(segment_duration)
                    media_template = segment_template.get('media')
                        ms_info['media_template'] = media_template
                    initialization = segment_template.get('initialization')
                        ms_info['initialization_url'] = initialization
                        initialization = segment_template.find(_add_ns('Initialization'))
                        if initialization is not None:
                            ms_info['initialization_url'] = initialization.attrib['sourceURL']
        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # DRM'd adaptation sets/representations are skipped entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    mime_type = representation_attrib.get('mimeType')
                    content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type == 'video' or content_type == 'audio':
                        # Resolve BaseURL by walking up the hierarchy until
                        # an absolute URL is assembled.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # YouTube embeds the content length in a vendor attr.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                            # 'mul'/'und'/'zxx'/'mis' are ISO 639 "no real
                            # language" codes; treat them as unknown.
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                            # NOTE(review): `and 'segment_duration'` is a
                            # truthy string literal, not a membership test —
                            # almost certainly meant
                            # `'segment_duration' in representation_ms_info`.
                            if 'total_number' not in representation_ms_info and 'segment_duration':
                                segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                            media_template = representation_ms_info['media_template']
                            media_template = media_template.replace('$RepresentationID$', representation_id)
                            # Turn $Number%05d$ / $Bandwidth$ placeholders
                            # into %-style format specifiers.
                            media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
                            # NOTE(review): return value of replace() is
                            # discarded — the '$$' escape is never applied;
                            # should be `media_template = media_template...`.
                            media_template.replace('$$', '$')
                            representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        if 'segment_urls' in representation_ms_info:
                                'segment_urls': representation_ms_info['segment_urls'],
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                                    'initialization_url': initialization_url,
                                if not f.get('url'):
                                    f['url'] = initialization_url
                        # Merge with a format of the same id seen earlier
                        # (e.g. from another Period); otherwise append new.
                            existing_format = next(
                                fo for fo in formats
                                if fo['format_id'] == representation_id)
                        except StopIteration:
                            full_info = formats_dict.get(representation_id, {}).copy()
                            formats.append(full_info)
                            existing_format.update(f)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        self._sort_formats(formats)
1522 def _live_title(self, name):
1523 """ Generate the title for a live video """
1524 now = datetime.datetime.now()
1525 now_str = now.strftime('%Y-%m-%d %H:%M')
1526 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        # Parse v as an int via int_or_none; on failure either raise
        # (fatal=True) or report a warning and fall through to None.
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): looks like a leftover debug print — it writes the
            # attribute straight to stdout; consider removing or routing it
            # through the downloader's logging instead.
            print(getattr(v, kwargs['get_attr']))
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Float counterpart of _int: parse via float_or_none; raise or warn
        # on failure depending on fatal.
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
1550 def _set_cookie(self, domain, name, value, expire_time=None):
1551 cookie = compat_cookiejar.Cookie(
1552 0, name, value, None, None, domain, None,
1553 None, '/', True, False, expire_time, '', None, None, None)
1554 self._downloader.cookiejar.set_cookie(cookie)
1556 def _get_cookies(self, url):
1557 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1558 req = sanitized_Request(url)
1559 self._downloader.cookiejar.add_cookie_header(req)
1560 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        # Yield this extractor's test cases from _TEST/_TESTS, tagging each
        # with the extractor name (class name minus the 'IE' suffix).
        t = getattr(self, '_TEST', None)
            # _TEST and _TESTS are mutually exclusive by convention.
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            # only_matching entries are URL-matching stubs; skip them unless
            # the caller explicitly asked for them.
            if not include_onlymatching and t.get('only_matching', False):
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # The extractor is unsuitable only when every test case is
        # age-restricted beyond the given limit.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if 'playlist' in tc:
                # For playlist tests, judge by the first entry.
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
    def extract_subtitles(self, *args, **kwargs):
        # Public wrapper: fetch subtitles only when the user asked for them
        # (writesubtitles/listsubtitles); delegates to _get_subtitles.
        if (self._downloader.params.get('writesubtitles', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
1597 def _get_subtitles(self, *args, **kwargs):
1598 raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        # Items already present in list1 win; list2 contributes only URLs
        # not seen in list1.
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        # Per-language merge via _merge_subtitle_items; languages present
        # only in dict2 are added as-is.
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
    def extract_automatic_captions(self, *args, **kwargs):
        # Public wrapper: fetch automatic captions only when requested
        # (writeautomaticsub/listsubtitles); delegates to
        # _get_automatic_captions.
        if (self._downloader.params.get('writeautomaticsub', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
1623 def _get_automatic_captions(self, *args, **kwargs):
1624 raise NotImplementedError('This method must be implemented by subclasses')
1626 def mark_watched(self, *args, **kwargs):
1627 if (self._downloader.params.get('mark_watched', False) and
1628 (self._get_login_info()[0] is not None or
1629 self._downloader.params.get('cookiefile') is not None)):
1630 self._mark_watched(*args, **kwargs)
1632 def _mark_watched(self, *args, **kwargs):
1633 raise NotImplementedError('This method must be implemented by subclasses')
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
1644 def _make_valid_url(cls):
1645 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1648 def suitable(cls, url):
1649 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse the search "URL" (<key><prefix>:<terms>) and fan out to
        # _get_n_results with the requested result count.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: just the first result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            # Numeric prefix: clamp to the extractor's maximum.
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
1671 def _get_n_results(self, query, n):
1672 """Get a specified number of results for a query"""
1673 raise NotImplementedError('This method must be implemented by subclasses')
1676 def SEARCH_KEY(self):
1677 return self._SEARCH_KEY