1 from __future__ import unicode_literals
15 from ..compat import (
24 compat_etree_fromstring,
53 class InfoExtractor(object):
54 """Information Extractor class.
56 Information extractors are the classes that, given a URL, extract
57 information about the video (or videos) the URL refers to. This
58 information includes the real video URL, the video title, author and
59 others. The information is stored in a dictionary which is then
60 passed to the YoutubeDL. The YoutubeDL processes this
61 information possibly downloading the video to the file system, among
62 other possible outcomes.
64 The type field determines the type of the result.
65 By far the most common value (and the default if _type is missing) is
66 "video", which indicates a single video.
68 For a video, the dictionaries must include the following fields:
71 title: Video title, unescaped.
73 Additionally, it must contain either a formats entry or a url one:
75 formats: A list of dictionaries for each format available, ordered
76 from worst to best quality.
79 * url Mandatory. The URL of the video file
80 * ext Will be calculated from URL if missing
81 * format A human-readable description of the format
82 ("mp4 container with h264/opus").
83 Calculated from the format_id, width, height.
84 and format_note fields if missing.
85 * format_id A short description of the format
86 ("mp4_h264_opus" or "19").
87 Technically optional, but strongly recommended.
88 * format_note Additional info about the format
89 ("3D" or "DASH video")
90 * width Width of the video, if known
91 * height Height of the video, if known
92 * resolution Textual description of width and height
93 * tbr Average bitrate of audio and video in KBit/s
94 * abr Average audio bitrate in KBit/s
95 * acodec Name of the audio codec in use
96 * asr Audio sampling rate in Hertz
97 * vbr Average video bitrate in KBit/s
99 * vcodec Name of the video codec in use
100 * container Name of the container format
101 * filesize The number of bytes, if known in advance
102 * filesize_approx An estimate for the number of bytes
103 * player_url SWF Player URL (used for rtmpdump).
104 * protocol The protocol that will be used for the actual
105 download, lower-case.
106 "http", "https", "rtsp", "rtmp", "rtmpe",
107 "m3u8", or "m3u8_native".
108 * preference Order number of this format. If this field is
109 present and not None, the formats get sorted
110 by this field, regardless of all other values.
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 < -1000 to hide the format (if there is
114 another one which is strictly better)
115 * language Language code, e.g. "de" or "en-US".
116 * language_preference Is this in the language mentioned in
118 10 if it's what the URL is about,
119 -1 for default (don't know),
120 -10 otherwise, other values reserved for now.
121 * quality Order number of the video quality of this
122 format, irrespective of the file format.
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
125 * source_preference Order number for this video source
126 (quality takes higher priority)
127 -1 for default (order by other properties),
128 -2 or smaller for less than default.
129 * http_headers A dictionary of additional HTTP headers
130 to add to the request.
131 * stretched_ratio If given and not 1, indicates that the
132 video's pixels are not square.
133 width : height ratio as float.
134 * no_resume The server does not support resuming the
135 (HTTP or RTMP) download. Boolean.
137 url: Final video URL.
138 ext: Video filename extension.
139 format: The video format, defaults to ext (used for --get-format)
140 player_url: SWF Player URL (used for rtmpdump).
142 The following fields are optional:
144 alt_title: A secondary title of the video.
145 display_id An alternative identifier for the video, not necessarily
146 unique, but available before title. Typically, id is
147 something like "4234987", title "Dancing naked mole rats",
148 and display_id "dancing-naked-mole-rats"
149 thumbnails: A list of dictionaries, with the following entries:
150 * "id" (optional, string) - Thumbnail format ID
152 * "preference" (optional, int) - quality of the image
153 * "width" (optional, int)
154 * "height" (optional, int)
155 * "resolution" (optional, string "{width}x{height}"),
157 thumbnail: Full URL to a video thumbnail image.
158 description: Full video description.
159 uploader: Full name of the video uploader.
160 creator: The main artist who created the video.
161 release_date: The date (YYYYMMDD) when the video was released.
162 timestamp: UNIX timestamp of the moment the video became available.
163 upload_date: Video upload date (YYYYMMDD).
164 If not explicitly set, calculated from timestamp.
165 uploader_id: Nickname or id of the video uploader.
166 location: Physical location where the video was filmed.
167 subtitles: The available subtitles as a dictionary in the format
168 {language: subformats}. "subformats" is a list sorted from
169 lower to higher preference, each element is a dictionary
170 with the "ext" entry and one of:
171 * "data": The subtitles file contents
172 * "url": A URL pointing to the subtitles file
173 "ext" will be calculated from URL if missing
174 automatic_captions: Like 'subtitles', used by the YoutubeIE for
175 automatically generated captions
176 duration: Length of the video in seconds, as an integer or float.
177 view_count: How many users have watched the video on the platform.
178 like_count: Number of positive ratings of the video
179 dislike_count: Number of negative ratings of the video
180 repost_count: Number of reposts of the video
181 average_rating: Average rating given by users, the scale used depends on the webpage
182 comment_count: Number of comments on the video
183 comments: A list of comments, each with one or more of the following
184 properties (all but one of text or html optional):
185 * "author" - human-readable name of the comment author
186 * "author_id" - user ID of the comment author
188 * "html" - Comment as HTML
189 * "text" - Plain text of the comment
190 * "timestamp" - UNIX timestamp of comment
191 * "parent" - ID of the comment this one is replying to.
192 Set to "root" to indicate that this is a
193 comment to the original video.
194 age_limit: Age restriction for the video, as an integer (years)
195 webpage_url: The URL to the video webpage, if given to youtube-dl it
196 should allow to get the same result again. (It will be set
197 by YoutubeDL if it's missing)
198 categories: A list of categories that the video falls in, for example
200 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
201 is_live: True, False, or None (=unknown). Whether this video is a
202 live stream that goes on instead of a fixed-length video.
203 start_time: Time in seconds where the reproduction should start, as
204 specified in the URL.
205 end_time: Time in seconds where the reproduction should end, as
206 specified in the URL.
208 The following fields should only be used when the video belongs to some logical
211 chapter: Name or title of the chapter the video belongs to.
212 chapter_number: Number of the chapter the video belongs to, as an integer.
213 chapter_id: Id of the chapter the video belongs to, as a unicode string.
215 The following fields should only be used when the video is an episode of some
218 series: Title of the series or programme the video episode belongs to.
219 season: Title of the season the video episode belongs to.
220 season_number: Number of the season the video episode belongs to, as an integer.
221 season_id: Id of the season the video episode belongs to, as a unicode string.
222 episode: Title of the video episode. Unlike mandatory video title field,
223 this field should denote the exact title of the video episode
224 without any kind of decoration.
225 episode_number: Number of the video episode within a season, as an integer.
226 episode_id: Id of the video episode, as a unicode string.
228 Unless mentioned otherwise, the fields should be Unicode strings.
230 Unless mentioned otherwise, None is equivalent to absence of information.
233 _type "playlist" indicates multiple videos.
234 There must be a key "entries", which is a list, an iterable, or a PagedList
235 object, each element of which is a valid dictionary by this specification.
237 Additionally, playlists can have "title", "description" and "id" attributes
238 with the same semantics as videos (see above).
241 _type "multi_video" indicates that there are multiple videos that
242 form a single show, for example multiple acts of an opera or TV episode.
243 It must have an entries key like a playlist and contain all the keys
244 required for a video at the same time.
247 _type "url" indicates that the video must be extracted from another
248 location, possibly by a different extractor. Its only required key is:
249 "url" - the next URL to extract.
250 The key "ie_key" can be set to the class name (minus the trailing "IE",
251 e.g. "Youtube") if the extractor class is known in advance.
252 Additionally, the dictionary may have any properties of the resolved entity
253 known in advance, for example "title" if the title of the referred video is
257 _type "url_transparent" entities have the same specification as "url", but
258 indicate that the given additional information is more precise than the one
259 associated with the resolved URL.
260 This is useful when a site employs a video service that hosts the video and
261 its technical metadata, but that video service does not embed a useful
262 title, description etc.
265 Subclasses of this one should re-define the _real_initialize() and
266 _real_extract() methods and define a _VALID_URL regexp.
267 Probably, they should also be added to the list of extractors.
269 Finally, the _WORKING attribute should be set to False for broken IEs
270 in order to warn the users and skip the tests.
277 def __init__(self, downloader=None):
278 """Constructor. Receives an optional downloader."""
280 self.set_downloader(downloader)
def suitable(cls, url):
    """Return True when url matches this IE's _VALID_URL pattern."""

    # The compiled pattern is cached per class. The class' own __dict__ is
    # inspected on purpose (rather than hasattr/getattr) because attribute
    # lookup would also find a pattern cached on a superclass.
    try:
        valid_url_re = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        valid_url_re = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return valid_url_re.match(url) is not None
294 def _match_id(cls, url):
295 if '_VALID_URL_RE' not in cls.__dict__:
296 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
297 m = cls._VALID_URL_RE.match(url)
303 """Getter method for _WORKING."""
306 def initialize(self):
307 """Initializes an instance (authentication, etc)."""
309 self._real_initialize()
312 def extract(self, url):
313 """Extracts URL information and returns it in list of dicts."""
316 return self._real_extract(url)
317 except ExtractorError:
319 except compat_http_client.IncompleteRead as e:
320 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
321 except (KeyError, StopIteration) as e:
322 raise ExtractorError('An extractor error has occurred.', cause=e)
def set_downloader(self, downloader):
    """Attach the given downloader to this IE (stored as self._downloader)."""
    self._downloader = downloader
328 def _real_initialize(self):
329 """Real initialization process. Redefine in subclasses."""
332 def _real_extract(self, url):
333 """Real extraction process. Redefine in subclasses."""
338 """A string for getting the InfoExtractor with get_info_extractor"""
339 return compat_str(cls.__name__[:-2])
343 return compat_str(type(self).__name__[:-2])
345 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
346 """ Returns the response handle """
348 self.report_download_webpage(video_id)
349 elif note is not False:
351 self.to_screen('%s' % (note,))
353 self.to_screen('%s: %s' % (video_id, note))
355 return self._downloader.urlopen(url_or_request)
356 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
360 errnote = 'Unable to download webpage'
362 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
364 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
366 self._downloader.report_warning(errmsg)
369 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
370 """ Returns a tuple (page content as string, URL handle) """
371 # Strip hashes from the URL (#1038)
372 if isinstance(url_or_request, (compat_str, str)):
373 url_or_request = url_or_request.partition('#')[0]
375 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
379 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
380 return (content, urlh)
383 def _guess_encoding_from_content(content_type, webpage_bytes):
384 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
386 encoding = m.group(1)
388 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
389 webpage_bytes[:1024])
391 encoding = m.group(1).decode('ascii')
392 elif webpage_bytes.startswith(b'\xff\xfe'):
399 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
400 content_type = urlh.headers.get('Content-Type', '')
401 webpage_bytes = urlh.read()
402 if prefix is not None:
403 webpage_bytes = prefix + webpage_bytes
405 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
406 if self._downloader.params.get('dump_intermediate_pages', False):
408 url = url_or_request.get_full_url()
409 except AttributeError:
411 self.to_screen('Dumping request to ' + url)
412 dump = base64.b64encode(webpage_bytes).decode('ascii')
413 self._downloader.to_screen(dump)
414 if self._downloader.params.get('write_pages', False):
416 url = url_or_request.get_full_url()
417 except AttributeError:
419 basen = '%s_%s' % (video_id, url)
421 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
422 basen = basen[:240 - len(h)] + h
423 raw_filename = basen + '.dump'
424 filename = sanitize_filename(raw_filename, restricted=True)
425 self.to_screen('Saving request to ' + filename)
426 # Working around MAX_PATH limitation on Windows (see
427 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
429 absfilepath = os.path.abspath(filename)
430 if len(absfilepath) > 259:
431 filename = '\\\\?\\' + absfilepath
432 with open(filename, 'wb') as outf:
433 outf.write(webpage_bytes)
436 content = webpage_bytes.decode(encoding, 'replace')
438 content = webpage_bytes.decode('utf-8', 'replace')
440 if ('<title>Access to this site is blocked</title>' in content and
441 'Websense' in content[:512]):
442 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
443 blocked_iframe = self._html_search_regex(
444 r'<iframe src="([^"]+)"', content,
445 'Websense information URL', default=None)
447 msg += ' Visit %s for more details' % blocked_iframe
448 raise ExtractorError(msg, expected=True)
449 if '<title>The URL you requested has been blocked</title>' in content[:512]:
451 'Access to this webpage has been blocked by Indian censorship. '
452 'Use a VPN or proxy server (with --proxy) to route around it.')
453 block_msg = self._html_search_regex(
454 r'</h1><p>(.*?)</p>',
455 content, 'block message', default=None)
457 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
458 raise ExtractorError(msg, expected=True)
462 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
463 """ Returns the data of the page as a string """
466 while success is False:
468 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
470 except compat_http_client.IncompleteRead as e:
472 if try_count >= tries:
474 self._sleep(timeout, video_id)
481 def _download_xml(self, url_or_request, video_id,
482 note='Downloading XML', errnote='Unable to download XML',
483 transform_source=None, fatal=True, encoding=None):
484 """Return the xml as an xml.etree.ElementTree.Element"""
485 xml_string = self._download_webpage(
486 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
487 if xml_string is False:
490 xml_string = transform_source(xml_string)
491 return compat_etree_fromstring(xml_string.encode('utf-8'))
493 def _download_json(self, url_or_request, video_id,
494 note='Downloading JSON metadata',
495 errnote='Unable to download JSON metadata',
496 transform_source=None,
497 fatal=True, encoding=None):
498 json_string = self._download_webpage(
499 url_or_request, video_id, note, errnote, fatal=fatal,
501 if (not fatal) and json_string is False:
503 return self._parse_json(
504 json_string, video_id, transform_source=transform_source, fatal=fatal)
506 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
508 json_string = transform_source(json_string)
510 return json.loads(json_string)
511 except ValueError as ve:
512 errmsg = '%s: Failed to parse JSON ' % video_id
514 raise ExtractorError(errmsg, cause=ve)
516 self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with '[ie_name]'.

    When video_id is given, it is inserted as a 'video_id: ' prefix between
    the tag and the message.
    """
    id_prefix = '%s: ' % video_id if video_id is not None else ''
    warning = '[%s] %s%s' % (self.IE_NAME, id_prefix, msg)
    self._downloader.report_warning(warning)
def to_screen(self, msg):
    """Send msg to the downloader's screen output with an '[ie_name]' prefix."""
    tagged = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)
def report_extraction(self, id_or_name):
    """Announce on screen that information is being extracted for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce on screen that the webpage for video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce on screen that age confirmation is being attempted."""
    message = 'Confirming age'
    self.to_screen(message)
def report_login(self):
    """Announce on screen that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
544 def raise_login_required(msg='This video is only available for registered users'):
545 raise ExtractorError(
546 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
550 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
551 raise ExtractorError(
552 '%s. You might want to use --proxy to workaround.' % msg,
555 # Methods for following #608
557 def url_result(url, ie=None, video_id=None, video_title=None):
558 """Returns a URL that points to a page that should be processed"""
559 # TODO: ie should be the class used for getting the info
560 video_info = {'_type': 'url',
563 if video_id is not None:
564 video_info['id'] = video_id
565 if video_title is not None:
566 video_info['title'] = video_title
570 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
571 """Returns a playlist"""
572 video_info = {'_type': 'playlist',
575 video_info['id'] = playlist_id
577 video_info['title'] = playlist_title
578 if playlist_description:
579 video_info['description'] = playlist_description
582 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
584 Perform a regex search on the given string, using a single or a list of
585 patterns returning the first matching group.
586 In case of failure return a default value or raise a WARNING or a
587 RegexNotFoundError, depending on fatal, specifying the field name.
589 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
590 mobj = re.search(pattern, string, flags)
593 mobj = re.search(p, string, flags)
597 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
598 _name = '\033[0;34m%s\033[0m' % name
604 # return the first matching group
605 return next(g for g in mobj.groups() if g is not None)
607 return mobj.group(group)
608 elif default is not NO_DEFAULT:
611 raise RegexNotFoundError('Unable to extract %s' % _name)
613 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
616 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
618 Like _search_regex, but strips HTML tags and unescapes entities.
620 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
622 return clean_html(res).strip()
626 def _get_login_info(self):
628 Get the login info as (username, password)
629 It will look in the netrc file using the _NETRC_MACHINE value
630 If there's no info available, return (None, None)
632 if self._downloader is None:
637 downloader_params = self._downloader.params
639 # Attempt to use provided username and password or .netrc data
640 if downloader_params.get('username') is not None:
641 username = downloader_params['username']
642 password = downloader_params['password']
643 elif downloader_params.get('usenetrc', False):
645 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
650 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
651 except (IOError, netrc.NetrcParseError) as err:
652 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
654 return (username, password)
656 def _get_tfa_info(self, note='two-factor verification code'):
658 Get the two-factor authentication info
659 TODO - asking the user will be required for sms/phone verify
660 currently just uses the command line option
661 If there's no info available, return None
663 if self._downloader is None:
665 downloader_params = self._downloader.params
667 if downloader_params.get('twofactor') is not None:
668 return downloader_params['twofactor']
670 return compat_getpass('Type %s and press [Return]: ' % note)
672 # Helper functions for extracting OpenGraph info
674 def _og_regexes(prop):
675 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
676 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
677 % {'prop': re.escape(prop)})
678 template = r'<meta[^>]+?%s[^>]+?%s'
680 template % (property_re, content_re),
681 template % (content_re, property_re),
685 def _meta_regex(prop):
686 return r'''(?isx)<meta
687 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
688 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
690 def _og_search_property(self, prop, html, name=None, **kargs):
692 name = 'OpenGraph %s' % prop
693 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
696 return unescapeHTML(escaped)
698 def _og_search_thumbnail(self, html, **kargs):
699 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
701 def _og_search_description(self, html, **kargs):
702 return self._og_search_property('description', html, fatal=False, **kargs)
704 def _og_search_title(self, html, **kargs):
705 return self._og_search_property('title', html, **kargs)
707 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
708 regexes = self._og_regexes('video') + self._og_regexes('video:url')
710 regexes = self._og_regexes('video:secure_url') + regexes
711 return self._html_search_regex(regexes, html, name, **kargs)
713 def _og_search_url(self, html, **kargs):
714 return self._og_search_property('url', html, **kargs)
716 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
717 if display_name is None:
719 return self._html_search_regex(
720 self._meta_regex(name),
721 html, display_name, fatal=fatal, group='content', **kwargs)
723 def _dc_search_uploader(self, html):
724 return self._html_search_meta('dc.creator', html, 'uploader')
726 def _rta_search(self, html):
727 # See http://www.rtalabel.org/index.php?content=howtofaq#single
728 if re.search(r'(?ix)<meta\s+name="rating"\s+'
729 r' content="RTA-5042-1996-1400-1577-RTA"',
734 def _media_rating_search(self, html):
735 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
736 rating = self._html_search_meta('rating', html)
748 return RATING_TABLE.get(rating.lower())
750 def _family_friendly_search(self, html):
751 # See http://schema.org/VideoObject
752 family_friendly = self._html_search_meta('isFamilyFriendly', html)
754 if not family_friendly:
763 return RATING_TABLE.get(family_friendly.lower())
765 def _twitter_search_player(self, html):
766 return self._html_search_meta('twitter:player', html,
767 'twitter card player')
769 def _search_json_ld(self, html, video_id, **kwargs):
770 json_ld = self._search_regex(
771 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
772 html, 'JSON-LD', group='json_ld', **kwargs)
775 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
777 def _json_ld(self, json_ld, video_id, fatal=True):
778 if isinstance(json_ld, compat_str):
779 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
783 if json_ld.get('@context') == 'http://schema.org':
784 item_type = json_ld.get('@type')
785 if item_type == 'TVEpisode':
787 'episode': unescapeHTML(json_ld.get('name')),
788 'episode_number': int_or_none(json_ld.get('episodeNumber')),
789 'description': unescapeHTML(json_ld.get('description')),
791 part_of_season = json_ld.get('partOfSeason')
792 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
793 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
794 part_of_series = json_ld.get('partOfSeries')
795 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
796 info['series'] = unescapeHTML(part_of_series.get('name'))
797 elif item_type == 'Article':
799 'timestamp': parse_iso8601(json_ld.get('datePublished')),
800 'title': unescapeHTML(json_ld.get('headline')),
801 'description': unescapeHTML(json_ld.get('articleBody')),
803 return dict((k, v) for k, v in info.items() if v is not None)
806 def _hidden_inputs(html):
807 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
809 for input in re.findall(r'(?i)<input([^>]+)>', html):
810 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
812 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
815 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
818 hidden_inputs[name.group('value')] = value.group('value')
821 def _form_hidden_inputs(self, form_id, html):
822 form = self._search_regex(
823 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
824 html, '%s form' % form_id, group='form')
825 return self._hidden_inputs(form)
827 def _sort_formats(self, formats, field_preference=None):
829 raise ExtractorError('No video formats found')
832 # Automatically determine tbr when missing based on abr and vbr (improves
833 # formats sorting in some cases)
834 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
835 f['tbr'] = f['abr'] + f['vbr']
838 # TODO remove the following workaround
839 from ..utils import determine_ext
840 if not f.get('ext') and 'url' in f:
841 f['ext'] = determine_ext(f['url'])
843 if isinstance(field_preference, (list, tuple)):
844 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
846 preference = f.get('preference')
847 if preference is None:
849 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
852 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
854 if f.get('vcodec') == 'none': # audio only
855 if self._downloader.params.get('prefer_free_formats'):
856 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
858 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
861 audio_ext_preference = ORDER.index(f['ext'])
863 audio_ext_preference = -1
865 if self._downloader.params.get('prefer_free_formats'):
866 ORDER = ['flv', 'mp4', 'webm']
868 ORDER = ['webm', 'flv', 'mp4']
870 ext_preference = ORDER.index(f['ext'])
873 audio_ext_preference = 0
877 f.get('language_preference') if f.get('language_preference') is not None else -1,
878 f.get('quality') if f.get('quality') is not None else -1,
879 f.get('tbr') if f.get('tbr') is not None else -1,
880 f.get('filesize') if f.get('filesize') is not None else -1,
881 f.get('vbr') if f.get('vbr') is not None else -1,
882 f.get('height') if f.get('height') is not None else -1,
883 f.get('width') if f.get('width') is not None else -1,
886 f.get('abr') if f.get('abr') is not None else -1,
887 audio_ext_preference,
888 f.get('fps') if f.get('fps') is not None else -1,
889 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
890 f.get('source_preference') if f.get('source_preference') is not None else -1,
891 f.get('format_id') if f.get('format_id') is not None else '',
893 formats.sort(key=_formats_key)
895 def _check_formats(self, formats, video_id):
898 lambda f: self._is_valid_url(
900 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
904 def _remove_duplicate_formats(formats):
908 if f['url'] not in format_urls:
909 format_urls.add(f['url'])
910 unique_formats.append(f)
911 formats[:] = unique_formats
913 def _is_valid_url(self, url, video_id, item='video'):
914 url = self._proto_relative_url(url, scheme='http:')
915 # For now assume non HTTP(S) URLs always valid
916 if not (url.startswith('http://') or url.startswith('https://')):
919 self._request_webpage(url, video_id, 'Checking %s URL' % item)
921 except ExtractorError as e:
922 if isinstance(e.cause, compat_urllib_error.URLError):
924 '%s: %s URL is invalid, skipping' % (video_id, item))
928 def http_scheme(self):
929 """ Either "http:" or "https:", depending on the user's preferences """
932 if self._downloader.params.get('prefer_insecure', False)
935 def _proto_relative_url(self, url, scheme=None):
938 if url.startswith('//'):
940 scheme = self.http_scheme()
945 def _sleep(self, timeout, video_id, msg_template=None):
946 if msg_template is None:
947 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
948 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True):
    """Download an f4m manifest and return the formats it describes.

    manifest_url     -- URL of the f4m manifest.
    video_id         -- identifier handed to the download helper.
    preference       -- stored as 'preference' in every produced format.
    f4m_id           -- optional prefix for the generated format ids.
    transform_source -- callable applied to the raw manifest text before
                        it is parsed as XML.
    fatal            -- forwarded to the manifest download; when the
                        download yields False an empty list is returned.

    Returns the list of format dicts after self._sort_formats has been
    applied to it.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,
        fatal=fatal)

    if manifest is False:
        return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    base_url = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
        'base URL', default=None)
    if base_url:
        base_url = base_url.strip()
    # Relative media URLs are resolved against the manifest's baseURL or,
    # failing that, the directory of this manifest. It is computed once,
    # before the loop, so that the URL of one media entry never becomes
    # the base for the entries that follow it.
    relative_base = base_url or '/'.join(manifest_url.split('/')[:-1])
    for i, media_el in enumerate(media_nodes):
        # Version 1.0 entries are addressed through the manifest itself.
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                continue
            format_url = (
                media_url if media_url.startswith(('http://', 'https://'))
                else relative_base + '/' + media_url)
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            if determine_ext(format_url) == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, preference, f4m_id, fatal=fatal))
                continue
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            # Fall back to the entry's position when no bitrate is given
            'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True):
    """Download an m3u8 playlist and return the formats it describes.

    A media playlist is returned as a single format pointing at the
    playlist itself. A master playlist yields a 'meta' format for the
    playlist URL followed by one format per listed variant.

    ext, preference and entry_protocol are copied into the produced
    formats; m3u8_id prefixes the format ids; note and errnote override
    the download messages; fatal is forwarded to the download, and an
    empty list is returned when the download yields False.
    """
    meta_format = {
        'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': preference - 1 if preference else -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
    formats = [meta_format]

    res = self._download_webpage_handle(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if res is False:
        return []
    m3u8_doc, urlh = res
    # Relative entries are resolved against the final (post-redirect) URL
    m3u8_url = urlh.geturl()

    def format_url(u):
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    # We should try extracting formats only from master playlists [1], i.e.
    # playlists that describe available qualities. On the other hand media
    # playlists [2] should be returned as is since they contain just the media
    # without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 2] master
    # playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
    # and MUST NOT appear in master playlist thus we can clearly detect media
    # playlist with this criterion.
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
    # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
    # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        return [{
            'url': m3u8_url,
            'format_id': m3u8_id,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }]

    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')

    def parse_attributes(tag_line):
        # Collect KEY=value pairs of a tag line, dropping surrounding quotes
        attributes = {}
        for m in kv_rex.finditer(tag_line):
            v = m.group('val')
            if v.startswith('"'):
                v = v[1:-1]
            attributes[m.group('key')] = v
        return attributes

    last_info = None
    last_media = None
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_attributes(line)
            continue
        if line.startswith('#EXT-X-MEDIA:'):
            last_media = parse_attributes(line)
            continue
        if line.startswith('#') or not line.strip():
            continue

        # Anything else is a URI line
        if last_info is None:
            formats.append({'url': format_url(line)})
            continue

        tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
        format_id = []
        if m3u8_id:
            format_id.append(m3u8_id)
        last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
        format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
        f = {
            'format_id': '-'.join(format_id),
            'url': format_url(line.strip()),
            'tbr': tbr,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }

        resolution = last_info.get('RESOLUTION')
        if resolution:
            width_str, height_str = resolution.split('x')
            f['width'] = int(width_str)
            f['height'] = int(height_str)

        codecs = last_info.get('CODECS')
        if codecs:
            va_codecs = codecs.split(',')
            if len(va_codecs) > 1:
                vcodec, acodec = va_codecs[:2]
            elif not resolution and va_codecs[0].startswith('mp4a'):
                # Audio only entries usually come with single codec and
                # no resolution. For more robustness we also check it to
                # be mp4 audio.
                vcodec, acodec = 'none', va_codecs[0]
            else:
                vcodec, acodec = va_codecs[0], None
            f.update({
                'acodec': acodec,
                'vcodec': vcodec,
            })

        if last_media is not None:
            # Media attributes are attached to the first variant after them
            f['m3u8_media'] = last_media
            last_media = None
        formats.append(f)
        last_info = {}
    self._sort_formats(formats)
    return formats
1129 def _xpath_ns(path, namespace=None):
1133 for c in path.split('/'):
1134 if not c or c == '.':
1137 out.append('{%s}%s' % (namespace, c))
1138 return '/'.join(out)
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download a SMIL document and return the formats parsed from it.

    An empty list is returned when the download yields False, which is
    only expected for non-fatal downloads.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is False:
        assert not fatal
        return []

    smil_namespace = self._parse_smil_namespace(smil)
    return self._parse_smil_formats(
        smil, smil_url, video_id,
        namespace=smil_namespace, f4m_params=f4m_params)
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download a SMIL document and return the info dict parsed from it.

    An empty dict is returned when the download yields False.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is not False:
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
    return {}
def _download_smil(self, smil_url, video_id, fatal=True):
    """Fetch ``smil_url`` through self._download_xml and return its result."""
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml(smil_url, video_id, note, errnote, fatal=fatal)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    Formats and subtitles come from the dedicated SMIL parsers; title,
    description and upload date are read from the ./head/meta elements
    and thumbnails from the image elements that carry a src.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    # The id of the result is the SMIL file name without its extension;
    # the video_id argument is only used for the format extraction above.
    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        meta_name = meta.attrib.get('name')
        meta_content = meta.attrib.get('content')
        if not (meta_name and meta_content):
            continue
        # The first usable value of each kind wins
        if not title and meta_name == 'title':
            title = meta_content
        elif not description and meta_name in ('description', 'abstract'):
            description = meta_content
        elif not upload_date and meta_name == 'date':
            upload_date = unified_strdate(meta_content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
def _parse_smil_namespace(self, smil):
    """Return the namespace embedded in the root tag, or None.

    The tag is expected to look like '{namespace}smil' (case-insensitive).
    """
    root_tag_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(root_tag_re, smil.tag, 'namespace', default=None)
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Return the formats described by the video elements of a SMIL document.

    smil               -- root element of the parsed SMIL document.
    smil_url           -- URL of the document; base for relative sources
                          unless a head/meta element supplies 'base' or
                          'httpBase'.
    video_id           -- identifier handed to the nested extractions.
    namespace          -- XML namespace of the document, if any.
    f4m_params         -- query parameters appended to f4m manifest URLs;
                          a default set is used when empty.
    transform_rtmp_url -- optional callable taking (streamer, src) and
                          returning the pair to use for RTMP formats.

    Each distinct src is handled once, as RTMP, m3u8, f4m or plain HTTP.
    Returns the list after self._sort_formats has been applied to it.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0

    srcs = []
    videos = smil.findall(self._xpath_ns('.//video', namespace))
    for video in videos:
        src = video.get('src')
        if not src or src in srcs:
            continue
        srcs.append(src)

        bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        filesize = int_or_none(video.get('size') or video.get('fileSize'))
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        ext = video.get('ext')
        src_ext = determine_ext(src)
        streamer = video.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            if len(m3u8_formats) == 1:
                # A single result carries no quality information of its
                # own, so take it from the SMIL attributes.
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
            continue

        if src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            continue

        # Validate the resolved URL, i.e. the one that is stored in the
        # format, rather than the raw (possibly relative) src attribute.
        if src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            continue

    self._sort_formats(formats)

    return formats
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect the textstream elements of a SMIL document as subtitles.

    Returns a dict mapping a language to a list of {'url', 'ext'} dicts.
    Each distinct src is listed once; ``subtitles_lang`` is used when an
    element declares no language.
    """
    seen_srcs = []
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.append(src)
        ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
        lang = (
            textstream.get('systemLanguage') or
            textstream.get('systemLanguageName') or
            textstream.get('lang') or
            subtitles_lang)
        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles
def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
    """Download an XSPF playlist and return the entries parsed from it.

    An empty list is returned when the download yields False.
    """
    xspf = self._download_xml(
        playlist_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if xspf is False:
        return []
    return self._parse_xspf(xspf, playlist_id)
def _parse_xspf(self, playlist, playlist_id):
    """Return one entry dict per track of a parsed XSPF playlist.

    Every entry uses ``playlist_id`` as its id; the title falls back to
    ``playlist_id`` as well. Formats are built from the track's location
    elements and passed through self._sort_formats.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in playlist.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        duration_text = xpath_text(track, ns('./xspf:duration'), 'duration')
        duration = float_or_none(duration_text, 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            formats.append({
                'url': location.text,
                'format_id': location.get(ns('s1:label')),
                'width': int_or_none(location.get(ns('s1:width'))),
                'height': int_or_none(location.get(ns('s1:height'))),
            })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
    """Download an MPD manifest and return the formats parsed from it.

    note and errnote override the download messages; mpd_id and
    formats_dict are handed to self._parse_mpd_formats unchanged. An
    empty list is returned when the download yields False.
    """
    download_result = self._download_webpage_handle(
        mpd_url, video_id,
        note=note or 'Downloading MPD manifest',
        errnote=errnote or 'Failed to download MPD manifest',
        fatal=fatal)
    if download_result is False:
        return []
    mpd_text, url_handle = download_result
    # Base for relative URLs: the final manifest URL up to its last '/'
    mpd_base_url = re.match(r'https?://.+/', url_handle.geturl()).group()

    mpd_doc = compat_etree_fromstring(mpd_text.encode('utf-8'))
    return self._parse_mpd_formats(
        mpd_doc, mpd_id, mpd_base_url, formats_dict=formats_dict)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
    """Build format dicts from a parsed MPD manifest.

    mpd_doc      -- root element of the parsed MPD document.
    mpd_id       -- optional prefix for the generated format ids.
    mpd_base_url -- prepended to BaseURL values that are not absolute
                    http(s) URLs.
    formats_dict -- optional mapping of representation id to a dict of
                    extra format fields; it is only read, never modified.

    Manifests of type 'dynamic' yield an empty list. Adaptation sets and
    representations that contain a ContentProtection element are skipped.
    Returns the formats after self._sort_formats has been applied.
    """
    if mpd_doc.get('type') == 'dynamic':
        return []

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent's values so that settings are
        # inherited from Period to AdaptationSet to Representation.
        ms_info = ms_parent_info.copy()
        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            initialization = segment_list.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                start_number = segment_template.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        # Each S element counts once plus its repeat count
                        ms_info['total_number'] = 0
                        for s in s_e:
                            ms_info['total_number'] += 1 + int(s.get('r', '0'))
                else:
                    timescale = segment_template.get('timescale')
                    if timescale:
                        ms_info['timescale'] = int(timescale)
                    segment_duration = segment_template.get('duration')
                    if segment_duration:
                        ms_info['segment_duration'] = int(segment_duration)
                media_template = segment_template.get('media')
                if media_template:
                    ms_info['media_template'] = media_template
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization_url'] = initialization
                else:
                    initialization = segment_template.find(_add_ns('Initialization'))
                    if initialization is not None:
                        ms_info['initialization_url'] = initialization.attrib['sourceURL']
        return ms_info

    def expand_identifier(mobj):
        # '$Number$' -> '%(Number)d', '$Number%05d$' -> '%(Number)05d'.
        # A callable is used because the width group is optional and a
        # '\2' reference to an unmatched group fails before Python 3.5.
        return '%%(%s)%sd' % (mobj.group(1), mobj.group(2) or '')

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats = []
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            if is_drm_protected(adaptation_set):
                continue
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                if is_drm_protected(representation):
                    continue
                # Representation attributes override those of the set
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                mime_type = representation_attrib.get('mimeType')
                content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
                if content_type == 'text':
                    # TODO implement WebVTT downloading
                    pass
                elif content_type == 'video' or content_type == 'audio':
                    # Concatenate BaseURL values from the innermost element
                    # outwards until an absolute URL is reached.
                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if base_url_e is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    f = {
                        'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                        'url': base_url,
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                        'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                    }
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                    if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                        # Without a timeline the segment count is derived
                        # from the period duration and segment duration.
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                        media_template = representation_ms_info['media_template']
                        media_template = media_template.replace('$RepresentationID$', representation_id)
                        media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', expand_identifier, media_template)
                        # '$$' is the escape sequence for a literal '$'
                        media_template = media_template.replace('$$', '$')
                        # The attribute is a string but is formatted with %d
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                        start_number = representation_ms_info['start_number']
                        representation_ms_info['segment_urls'] = [
                            media_template % {'Number': segment_number, 'Bandwidth': bandwidth}
                            for segment_number in range(
                                start_number,
                                representation_ms_info['total_number'] + start_number)]
                    if 'segment_urls' in representation_ms_info:
                        f.update({
                            'segment_urls': representation_ms_info['segment_urls'],
                            'protocol': 'http_dash_segments',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                            f.update({
                                'initialization_url': initialization_url,
                            })
                            if not f.get('url'):
                                f['url'] = initialization_url
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == representation_id)
                    except StopIteration:
                        full_info = formats_dict.get(representation_id, {}).copy()
                        full_info.update(f)
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    self._sort_formats(formats)
    return formats
1519 def _live_title(self, name):
1520 """ Generate the title for a live video """
1521 now = datetime.datetime.now()
1522 now_str = now.strftime('%Y-%m-%d %H:%M')
1523 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """Convert ``v`` with int_or_none, reporting a failed conversion.

    ``name`` describes the value in the message; extra keyword arguments
    are forwarded to int_or_none. When the result is None an
    ExtractorError is raised if ``fatal`` is set, otherwise a warning is
    reported through the downloader and None is returned.
    """
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
def _float(self, v, name, fatal=False, **kwargs):
    """Convert ``v`` with float_or_none, reporting a failed conversion.

    When the result is None an ExtractorError is raised if ``fatal`` is
    set, otherwise a warning is reported through the downloader and None
    is returned.
    """
    converted = float_or_none(v, **kwargs)
    if converted is not None:
        return converted

    message = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(message)
    self._downloader.report_warning(message)
    return converted
def _set_cookie(self, domain, name, value, expire_time=None):
    """Store a cookie for ``domain`` with path '/' in the downloader's jar."""
    # NOTE(review): the positional layout presumably follows the standard
    # library Cookie constructor -- confirm what compat_cookiejar aliases.
    cookie_args = (
        0, name, value, None, None, domain, None,
        None, '/', True, False, expire_time, '', None, None, None)
    self._downloader.cookiejar.set_cookie(compat_cookiejar.Cookie(*cookie_args))
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    request = sanitized_Request(url)
    # Let the jar fill in the Cookie header it would send for this URL
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
def get_testcases(self, include_onlymatching=False):
    """Yield the test definitions declared through _TEST or _TESTS.

    Defining both attributes is rejected with an assertion. Tests flagged
    'only_matching' are skipped unless ``include_onlymatching`` is set.
    Every yielded dict gets a 'name' key holding the class name without
    its last two characters (the 'IE' suffix).
    """
    single_test = getattr(self, '_TEST', None)
    if single_test:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        candidates = [single_test]
    else:
        candidates = getattr(self, '_TESTS', [])

    for test_case in candidates:
        skip = not include_onlymatching and test_case.get('only_matching', False)
        if skip:
            continue
        test_case['name'] = type(self).__name__[:-len('IE')]
        yield test_case
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    saw_restricted = False
    for tc in self.get_testcases(include_onlymatching=False):
        # For playlist tests the first playlist entry is the one examined
        case = tc['playlist'][0] if 'playlist' in tc else tc
        case_age_limit = case.get('info_dict', {}).get('age_limit')
        if not age_restricted(case_age_limit, age_limit):
            # One unrestricted test case is enough
            return True
        saw_restricted = True
    # Also suitable when there are no test cases to judge by
    return not saw_restricted
def extract_subtitles(self, *args, **kwargs):
    """Return self._get_subtitles(...) when subtitles are requested.

    Subtitles count as requested when the downloader's 'writesubtitles'
    or 'listsubtitles' parameter is set; otherwise an empty dict is
    returned and _get_subtitles is not called.
    """
    params = self._downloader.params
    wanted = params.get('writesubtitles', False) or params.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
1594 def _get_subtitles(self, *args, **kwargs):
1595 raise NotImplementedError('This method must be implemented by subclasses')
1598 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1599 """ Merge subtitle items for one language. Items with duplicated URLs
1600 will be dropped. """
1601 list1_urls = set([item['url'] for item in subtitle_list1])
1602 ret = list(subtitle_list1)
1603 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1607 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1608 """ Merge two subtitle dictionaries, language by language. """
1609 ret = dict(subtitle_dict1)
1610 for lang in subtitle_dict2:
1611 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
def extract_automatic_captions(self, *args, **kwargs):
    """Return self._get_automatic_captions(...) when captions are requested.

    Captions count as requested when the downloader's 'writeautomaticsub'
    or 'listsubtitles' parameter is set; otherwise an empty dict is
    returned and _get_automatic_captions is not called.
    """
    params = self._downloader.params
    wanted = params.get('writeautomaticsub', False) or params.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
1620 def _get_automatic_captions(self, *args, **kwargs):
1621 raise NotImplementedError('This method must be implemented by subclasses')
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|N):{query}, where N is
    a positive integer without leading zeros.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression that search queries must match."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether ``url`` is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the query prefix and fetch the requested number of results.

        An empty prefix requests one result and 'all' requests
        _MAX_RESULTS; a numeric prefix above _MAX_RESULTS is capped, with
        a warning reported through the downloader.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The value of _SEARCH_KEY."""
        return self._SEARCH_KEY