1 from __future__ import unicode_literals
15 from ..compat import (
24 compat_etree_fromstring,
53 class InfoExtractor(object):
54 """Information Extractor class.
56 Information extractors are the classes that, given a URL, extract
57 information about the video (or videos) the URL refers to. This
58 information includes the real video URL, the video title, author and
59 others. The information is stored in a dictionary which is then
60 passed to the YoutubeDL. The YoutubeDL processes this
61 information possibly downloading the video to the file system, among
62 other possible outcomes.
64 The type field determines the type of the result.
65 By far the most common value (and the default if _type is missing) is
66 "video", which indicates a single video.
68 For a video, the dictionaries must include the following fields:
71 title: Video title, unescaped.
73 Additionally, it must contain either a formats entry or a url one:
75 formats: A list of dictionaries for each format available, ordered
76 from worst to best quality.
79 * url Mandatory. The URL of the video file
80 * ext Will be calculated from URL if missing
81 * format A human-readable description of the format
82 ("mp4 container with h264/opus").
83 Calculated from the format_id, width, height.
84 and format_note fields if missing.
85 * format_id A short description of the format
86 ("mp4_h264_opus" or "19").
87 Technically optional, but strongly recommended.
88 * format_note Additional info about the format
89 ("3D" or "DASH video")
90 * width Width of the video, if known
91 * height Height of the video, if known
92 * resolution Textual description of width and height
93 * tbr Average bitrate of audio and video in KBit/s
94 * abr Average audio bitrate in KBit/s
95 * acodec Name of the audio codec in use
96 * asr Audio sampling rate in Hertz
97 * vbr Average video bitrate in KBit/s
99 * vcodec Name of the video codec in use
100 * container Name of the container format
101 * filesize The number of bytes, if known in advance
102 * filesize_approx An estimate for the number of bytes
103 * player_url SWF Player URL (used for rtmpdump).
104 * protocol The protocol that will be used for the actual
105 download, lower-case.
106 "http", "https", "rtsp", "rtmp", "rtmpe",
107 "m3u8", or "m3u8_native".
108 * preference Order number of this format. If this field is
109 present and not None, the formats get sorted
110 by this field, regardless of all other values.
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 < -1000 to hide the format (if there is
114 another one which is strictly better)
115 * language Language code, e.g. "de" or "en-US".
116 * language_preference Is this in the language mentioned in
118 10 if it's what the URL is about,
119 -1 for default (don't know),
120 -10 otherwise, other values reserved for now.
121 * quality Order number of the video quality of this
122 format, irrespective of the file format.
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
125 * source_preference Order number for this video source
126 (quality takes higher priority)
127 -1 for default (order by other properties),
128 -2 or smaller for less than default.
129 * http_headers A dictionary of additional HTTP headers
130 to add to the request.
131 * stretched_ratio If given and not 1, indicates that the
132 video's pixels are not square.
133 width : height ratio as float.
134 * no_resume The server does not support resuming the
135 (HTTP or RTMP) download. Boolean.
137 url: Final video URL.
138 ext: Video filename extension.
139 format: The video format, defaults to ext (used for --get-format)
140 player_url: SWF Player URL (used for rtmpdump).
142 The following fields are optional:
144 alt_title: A secondary title of the video.
145 display_id An alternative identifier for the video, not necessarily
146 unique, but available before title. Typically, id is
147 something like "4234987", title "Dancing naked mole rats",
148 and display_id "dancing-naked-mole-rats"
149 thumbnails: A list of dictionaries, with the following entries:
150 * "id" (optional, string) - Thumbnail format ID
152 * "preference" (optional, int) - quality of the image
153 * "width" (optional, int)
154 * "height" (optional, int)
155 * "resolution" (optional, string "{width}x{height"},
157 thumbnail: Full URL to a video thumbnail image.
158 description: Full video description.
159 uploader: Full name of the video uploader.
160 license: License name the video is licensed under.
161 creator: The main artist who created the video.
162 release_date: The date (YYYYMMDD) when the video was released.
163 timestamp: UNIX timestamp of the moment the video became available.
164 upload_date: Video upload date (YYYYMMDD).
165 If not explicitly set, calculated from timestamp.
166 uploader_id: Nickname or id of the video uploader.
167 location: Physical location where the video was filmed.
168 subtitles: The available subtitles as a dictionary in the format
169 {language: subformats}. "subformats" is a list sorted from
170 lower to higher preference, each element is a dictionary
171 with the "ext" entry and one of:
172 * "data": The subtitles file contents
173 * "url": A URL pointing to the subtitles file
174 "ext" will be calculated from URL if missing
175 automatic_captions: Like 'subtitles', used by the YoutubeIE for
176 automatically generated captions
177 duration: Length of the video in seconds, as an integer or float.
178 view_count: How many users have watched the video on the platform.
179 like_count: Number of positive ratings of the video
180 dislike_count: Number of negative ratings of the video
181 repost_count: Number of reposts of the video
182 average_rating: Average rating given by users, the scale used depends on the webpage
183 comment_count: Number of comments on the video
184 comments: A list of comments, each with one or more of the following
185 properties (all but one of text or html optional):
186 * "author" - human-readable name of the comment author
187 * "author_id" - user ID of the comment author
189 * "html" - Comment as HTML
190 * "text" - Plain text of the comment
191 * "timestamp" - UNIX timestamp of comment
192 * "parent" - ID of the comment this one is replying to.
193 Set to "root" to indicate that this is a
194 comment to the original video.
195 age_limit: Age restriction for the video, as an integer (years)
196 webpage_url: The URL to the video webpage, if given to youtube-dl it
197 should allow to get the same result again. (It will be set
198 by YoutubeDL if it's missing)
199 categories: A list of categories that the video falls in, for example
201 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
202 is_live: True, False, or None (=unknown). Whether this video is a
203 live stream that goes on instead of a fixed-length video.
204 start_time: Time in seconds where the reproduction should start, as
205 specified in the URL.
206 end_time: Time in seconds where the reproduction should end, as
207 specified in the URL.
209 The following fields should only be used when the video belongs to some logical
212 chapter: Name or title of the chapter the video belongs to.
213 chapter_number: Number of the chapter the video belongs to, as an integer.
214 chapter_id: Id of the chapter the video belongs to, as a unicode string.
216 The following fields should only be used when the video is an episode of some
219 series: Title of the series or programme the video episode belongs to.
220 season: Title of the season the video episode belongs to.
221 season_number: Number of the season the video episode belongs to, as an integer.
222 season_id: Id of the season the video episode belongs to, as a unicode string.
223 episode: Title of the video episode. Unlike mandatory video title field,
224 this field should denote the exact title of the video episode
225 without any kind of decoration.
226 episode_number: Number of the video episode within a season, as an integer.
227 episode_id: Id of the video episode, as a unicode string.
229 Unless mentioned otherwise, the fields should be Unicode strings.
231 Unless mentioned otherwise, None is equivalent to absence of information.
234 _type "playlist" indicates multiple videos.
235 There must be a key "entries", which is a list, an iterable, or a PagedList
236 object, each element of which is a valid dictionary by this specification.
238 Additionally, playlists can have "title", "description" and "id" attributes
239 with the same semantics as videos (see above).
242 _type "multi_video" indicates that there are multiple videos that
243 form a single show, for example, multiple acts of an opera or TV episode.
244 It must have an entries key like a playlist and contain all the keys
245 required for a video at the same time.
248 _type "url" indicates that the video must be extracted from another
249 location, possibly by a different extractor. Its only required key is:
250 "url" - the next URL to extract.
251 The key "ie_key" can be set to the class name (minus the trailing "IE",
252 e.g. "Youtube") if the extractor class is known in advance.
253 Additionally, the dictionary may have any properties of the resolved entity
254 known in advance, for example "title" if the title of the referred video is
258 _type "url_transparent" entities have the same specification as "url", but
259 indicate that the given additional information is more precise than the one
260 associated with the resolved URL.
261 This is useful when a site employs a video service that hosts the video and
262 its technical metadata, but that video service does not embed a useful
263 title, description etc.
266 Subclasses of this one should re-define the _real_initialize() and
267 _real_extract() methods and define a _VALID_URL regexp.
268 Probably, they should also be added to the list of extractors.
270 Finally, the _WORKING attribute should be set to False for broken IEs
271 in order to warn the users and skip the tests.
278 def __init__(self, downloader=None):
279 """Constructor. Receives an optional downloader."""
281 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Deliberately inspect cls.__dict__ instead of hasattr/getattr: the goal
    # is to know whether the compiled regexp is cached on *this* class;
    # getattr would also pick up a pattern cached on a superclass.
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    matched = cls._VALID_URL_RE.match(url)
    return matched is not None
def _match_id(cls, url):
    """Return the mandatory 'id' group extracted from url via cls._VALID_URL.

    The compiled pattern is cached on this specific class (same caching
    scheme as suitable()). Callers are expected to have verified the URL
    with suitable() first; a non-matching url raises AssertionError.
    """
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    # The visible block computed the match but fell off the end without
    # returning the id; restore the assert + return.
    assert m
    return m.group('id')
304 """Getter method for _WORKING."""
307 def initialize(self):
308 """Initializes an instance (authentication, etc)."""
310 self._real_initialize()
313 def extract(self, url):
314 """Extracts URL information and returns it in list of dicts."""
317 return self._real_extract(url)
318 except ExtractorError:
320 except compat_http_client.IncompleteRead as e:
321 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
322 except (KeyError, StopIteration) as e:
323 raise ExtractorError('An extractor error has occurred.', cause=e)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Stored for later use by the download/report helpers.
    self._downloader = downloader
329 def _real_initialize(self):
330 """Real initialization process. Redefine in subclasses."""
333 def _real_extract(self, url):
334 """Real extraction process. Redefine in subclasses."""
339 """A string for getting the InfoExtractor with get_info_extractor"""
340 return compat_str(cls.__name__[:-2])
344 return compat_str(type(self).__name__[:-2])
346 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
347 """ Returns the response handle """
349 self.report_download_webpage(video_id)
350 elif note is not False:
352 self.to_screen('%s' % (note,))
354 self.to_screen('%s: %s' % (video_id, note))
356 return self._downloader.urlopen(url_or_request)
357 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
361 errnote = 'Unable to download webpage'
363 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
365 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
367 self._downloader.report_warning(errmsg)
370 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
371 """ Returns a tuple (page content as string, URL handle) """
372 # Strip hashes from the URL (#1038)
373 if isinstance(url_or_request, (compat_str, str)):
374 url_or_request = url_or_request.partition('#')[0]
376 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
380 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
381 return (content, urlh)
384 def _guess_encoding_from_content(content_type, webpage_bytes):
385 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
387 encoding = m.group(1)
389 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
390 webpage_bytes[:1024])
392 encoding = m.group(1).decode('ascii')
393 elif webpage_bytes.startswith(b'\xff\xfe'):
400 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
401 content_type = urlh.headers.get('Content-Type', '')
402 webpage_bytes = urlh.read()
403 if prefix is not None:
404 webpage_bytes = prefix + webpage_bytes
406 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
407 if self._downloader.params.get('dump_intermediate_pages', False):
409 url = url_or_request.get_full_url()
410 except AttributeError:
412 self.to_screen('Dumping request to ' + url)
413 dump = base64.b64encode(webpage_bytes).decode('ascii')
414 self._downloader.to_screen(dump)
415 if self._downloader.params.get('write_pages', False):
417 url = url_or_request.get_full_url()
418 except AttributeError:
420 basen = '%s_%s' % (video_id, url)
422 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
423 basen = basen[:240 - len(h)] + h
424 raw_filename = basen + '.dump'
425 filename = sanitize_filename(raw_filename, restricted=True)
426 self.to_screen('Saving request to ' + filename)
427 # Working around MAX_PATH limitation on Windows (see
428 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
430 absfilepath = os.path.abspath(filename)
431 if len(absfilepath) > 259:
432 filename = '\\\\?\\' + absfilepath
433 with open(filename, 'wb') as outf:
434 outf.write(webpage_bytes)
437 content = webpage_bytes.decode(encoding, 'replace')
439 content = webpage_bytes.decode('utf-8', 'replace')
441 if ('<title>Access to this site is blocked</title>' in content and
442 'Websense' in content[:512]):
443 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
444 blocked_iframe = self._html_search_regex(
445 r'<iframe src="([^"]+)"', content,
446 'Websense information URL', default=None)
448 msg += ' Visit %s for more details' % blocked_iframe
449 raise ExtractorError(msg, expected=True)
450 if '<title>The URL you requested has been blocked</title>' in content[:512]:
452 'Access to this webpage has been blocked by Indian censorship. '
453 'Use a VPN or proxy server (with --proxy) to route around it.')
454 block_msg = self._html_search_regex(
455 r'</h1><p>(.*?)</p>',
456 content, 'block message', default=None)
458 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
459 raise ExtractorError(msg, expected=True)
463 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
464 """ Returns the data of the page as a string """
467 while success is False:
469 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
471 except compat_http_client.IncompleteRead as e:
473 if try_count >= tries:
475 self._sleep(timeout, video_id)
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True, encoding=None):
    """Return the xml as an xml.etree.ElementTree.Element

    Returns False when the download failed and fatal is False.
    transform_source, when given, is applied to the raw document before
    parsing (used e.g. to repair malformed manifests).
    """
    xml_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
    if xml_string is False:
        # Non-fatal download failure: propagate the False sentinel instead of
        # crashing on transform/parse below.
        return xml_string
    if transform_source:
        xml_string = transform_source(xml_string)
    return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True, encoding=None):
    """Download a document and parse it as JSON.

    Returns None when the download failed and fatal is False; otherwise
    delegates parsing (and error handling) to _parse_json.
    """
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding)
    if (not fatal) and json_string is False:
        return None
    return self._parse_json(
        json_string, video_id, transform_source=transform_source, fatal=fatal)
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
    """Parse json_string and return the resulting object.

    transform_source, when given, is applied to the string first. On a
    parse error, raises ExtractorError if fatal, otherwise emits a warning
    and returns None.
    """
    if transform_source:
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        errmsg = '%s: Failed to parse JSON ' % video_id
        if fatal:
            raise ExtractorError(errmsg, cause=ve)
        else:
            self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, prefixed with '[IE_NAME]' and,
    when given, the video id."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    self._downloader.report_warning('[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Notify the user that information extraction has started."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Notify the user that the webpage for video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Notify the user that an age-confirmation attempt is being made."""
    self.to_screen('Confirming age')
def report_login(self):
    """Notify the user that a login attempt is being made."""
    self.to_screen('Logging in')
def raise_login_required(msg='This video is only available for registered users'):
    """Raise an expected ExtractorError telling the user to supply credentials.

    The visible call was truncated mid-argument list; restore the
    expected=True flag so the error is reported as expected (no traceback).
    """
    raise ExtractorError(
        '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
        expected=True)
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
    """Raise an expected ExtractorError for geo-restricted content.

    The visible call was truncated mid-argument list; restore the
    expected=True flag so the error is reported as expected (no traceback).
    """
    raise ExtractorError(
        '%s. You might want to use --proxy to workaround.' % msg,
        expected=True)
556 # Methods for following #608
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a URL that points to a page that should be processed.

    Builds a '_type': 'url' info dict; 'id' and 'title' are only set when
    known, so later extraction can fill them in.
    """
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    if video_id is not None:
        video_info['id'] = video_id
    if video_title is not None:
        video_info['title'] = video_title
    # The visible block built the dict but never returned it; restore return.
    return video_info
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist info dict.

    'id', 'title' and 'description' are only set when truthy, matching the
    convention that absent metadata is simply omitted.
    """
    video_info = {'_type': 'playlist',
                  'entries': entries}
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    if playlist_description:
        video_info['description'] = playlist_description
    # The visible block built the dict but never returned it; restore return.
    return video_info
583 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
585 Perform a regex search on the given string, using a single or a list of
586 patterns returning the first matching group.
587 In case of failure return a default value or raise a WARNING or a
588 RegexNotFoundError, depending on fatal, specifying the field name.
590 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
591 mobj = re.search(pattern, string, flags)
594 mobj = re.search(p, string, flags)
598 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
599 _name = '\033[0;34m%s\033[0m' % name
605 # return the first matching group
606 return next(g for g in mobj.groups() if g is not None)
608 return mobj.group(group)
609 elif default is not NO_DEFAULT:
612 raise RegexNotFoundError('Unable to extract %s' % _name)
614 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
617 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
619 Like _search_regex, but strips HTML tags and unescapes entities.
621 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
623 return clean_html(res).strip()
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username') is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # Best effort: a broken .netrc only produces a warning.
            self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

    return (username, password)
def _get_tfa_info(self, note='two-factor verification code'):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    """
    if self._downloader is None:
        # Without a downloader there is no configuration to consult; the
        # visible block fell straight through to the attribute access below.
        return None
    downloader_params = self._downloader.params

    if downloader_params.get('twofactor') is not None:
        return downloader_params['twofactor']

    return compat_getpass('Type %s and press [Return]: ' % note)
673 # Helper functions for extracting OpenGraph info
675 def _og_regexes(prop):
676 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
677 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
678 % {'prop': re.escape(prop)})
679 template = r'<meta[^>]+?%s[^>]+?%s'
681 template % (property_re, content_re),
682 template % (content_re, property_re),
def _meta_regex(prop):
    """Build a verbose-mode regex that matches a <meta> tag whose identifying
    attribute (itemprop/name/property/id/http-equiv) equals prop, capturing
    the content attribute value in the named group 'content'."""
    escaped = re.escape(prop)
    # (?x) verbose mode: whitespace in the pattern is ignored.
    return r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2''' % escaped
691 def _og_search_property(self, prop, html, name=None, **kargs):
693 name = 'OpenGraph %s' % prop
694 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
697 return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Return the OpenGraph 'image' property as the thumbnail URL (non-fatal)."""
    return self._og_search_property(
        'image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    """Return the OpenGraph 'description' property (non-fatal)."""
    return self._og_search_property(
        'description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    """Return the OpenGraph 'title' property."""
    return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Return the OpenGraph video URL.

    When secure is true, og:video:secure_url candidates are tried first.
    The visible block prepended the secure_url regexes unconditionally,
    leaving the secure parameter without effect; restore the guard.
    """
    regexes = self._og_regexes('video') + self._og_regexes('video:url')
    if secure:
        regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Return the OpenGraph 'url' property."""
    return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Search html for a <meta> tag named name and return its content.

    display_name is what error/log messages call the field; it defaults to
    name itself (the fallback assignment was missing in the visible block,
    leaving display_name as None).
    """
    if display_name is None:
        display_name = name
    return self._html_search_regex(
        self._meta_regex(name),
        html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    """Extract the uploader name from the Dublin Core 'dc.creator' meta tag."""
    return self._html_search_meta('dc.creator', html, 'uploader')
727 def _rta_search(self, html):
728 # See http://www.rtalabel.org/index.php?content=howtofaq#single
729 if re.search(r'(?ix)<meta\s+name="rating"\s+'
730 r' content="RTA-5042-1996-1400-1577-RTA"',
735 def _media_rating_search(self, html):
736 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
737 rating = self._html_search_meta('rating', html)
749 return RATING_TABLE.get(rating.lower())
751 def _family_friendly_search(self, html):
752 # See http://schema.org/VideoObject
753 family_friendly = self._html_search_meta('isFamilyFriendly', html)
755 if not family_friendly:
764 return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
    """Extract the Twitter card player URL from the 'twitter:player' meta tag."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
770 def _search_json_ld(self, html, video_id, **kwargs):
771 json_ld = self._search_regex(
772 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
773 html, 'JSON-LD', group='json_ld', **kwargs)
776 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
778 def _json_ld(self, json_ld, video_id, fatal=True):
779 if isinstance(json_ld, compat_str):
780 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
784 if json_ld.get('@context') == 'http://schema.org':
785 item_type = json_ld.get('@type')
786 if item_type == 'TVEpisode':
788 'episode': unescapeHTML(json_ld.get('name')),
789 'episode_number': int_or_none(json_ld.get('episodeNumber')),
790 'description': unescapeHTML(json_ld.get('description')),
792 part_of_season = json_ld.get('partOfSeason')
793 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
794 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
795 part_of_series = json_ld.get('partOfSeries')
796 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
797 info['series'] = unescapeHTML(part_of_series.get('name'))
798 elif item_type == 'Article':
800 'timestamp': parse_iso8601(json_ld.get('datePublished')),
801 'title': unescapeHTML(json_ld.get('headline')),
802 'description': unescapeHTML(json_ld.get('articleBody')),
804 return dict((k, v) for k, v in info.items() if v is not None)
def _hidden_inputs(html):
    """Return {name: value} for every hidden/submit <input> element in html.

    The visible block lacked the accumulator, the loop header and the
    return; restored here. The local is also renamed from 'input' so the
    builtin is not shadowed.
    """
    # Strip HTML comments first so commented-out inputs are ignored.
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_el in re.findall(r'(?i)<input([^>]+)>', html):
        if not re.search(r'type=(["\'])(?:hidden|submit)\1', input_el):
            continue
        name = re.search(r'name=(["\'])(?P<value>.+?)\1', input_el)
        if not name:
            continue
        value = re.search(r'value=(["\'])(?P<value>.*?)\1', input_el)
        if not value:
            continue
        hidden_inputs[name.group('value')] = value.group('value')
    return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Return the hidden inputs of the <form> whose id attribute is form_id."""
    form_re = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_body = self._search_regex(
        form_re, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_body)
828 def _sort_formats(self, formats, field_preference=None):
830 raise ExtractorError('No video formats found')
833 # Automatically determine tbr when missing based on abr and vbr (improves
834 # formats sorting in some cases)
835 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
836 f['tbr'] = f['abr'] + f['vbr']
839 # TODO remove the following workaround
840 from ..utils import determine_ext
841 if not f.get('ext') and 'url' in f:
842 f['ext'] = determine_ext(f['url'])
844 if isinstance(field_preference, (list, tuple)):
845 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
847 preference = f.get('preference')
848 if preference is None:
850 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
853 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
855 if f.get('vcodec') == 'none': # audio only
856 if self._downloader.params.get('prefer_free_formats'):
857 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
859 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
862 audio_ext_preference = ORDER.index(f['ext'])
864 audio_ext_preference = -1
866 if self._downloader.params.get('prefer_free_formats'):
867 ORDER = ['flv', 'mp4', 'webm']
869 ORDER = ['webm', 'flv', 'mp4']
871 ext_preference = ORDER.index(f['ext'])
874 audio_ext_preference = 0
878 f.get('language_preference') if f.get('language_preference') is not None else -1,
879 f.get('quality') if f.get('quality') is not None else -1,
880 f.get('tbr') if f.get('tbr') is not None else -1,
881 f.get('filesize') if f.get('filesize') is not None else -1,
882 f.get('vbr') if f.get('vbr') is not None else -1,
883 f.get('height') if f.get('height') is not None else -1,
884 f.get('width') if f.get('width') is not None else -1,
887 f.get('abr') if f.get('abr') is not None else -1,
888 audio_ext_preference,
889 f.get('fps') if f.get('fps') is not None else -1,
890 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
891 f.get('source_preference') if f.get('source_preference') is not None else -1,
892 f.get('format_id') if f.get('format_id') is not None else '',
894 formats.sort(key=_formats_key)
896 def _check_formats(self, formats, video_id):
899 lambda f: self._is_valid_url(
901 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
def _remove_duplicate_formats(formats):
    """Remove, in place, formats whose 'url' was already seen, keeping the
    first occurrence of each URL and preserving order.

    The visible block lacked the accumulator initialisation and the loop
    header; restored here.
    """
    format_urls = set()
    unique_formats = []
    for f in formats:
        if f['url'] not in format_urls:
            format_urls.add(f['url'])
            unique_formats.append(f)
    # Mutate the caller's list in place rather than rebinding.
    formats[:] = unique_formats
def _is_valid_url(self, url, video_id, item='video'):
    """Check that url is reachable; return False only for confirmed-invalid
    HTTP(S) URLs (URLError), True otherwise. Other extraction errors are
    re-raised. The visible block lacked the returns and try scaffolding.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not url.startswith(('http://', 'https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item)
        return True
    except ExtractorError as e:
        if isinstance(e.cause, compat_urllib_error.URLError):
            self.to_screen(
                '%s: %s URL is invalid, skipping' % (video_id, item))
            return False
        raise
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    # The visible block had only the condition line; restore the return
    # expression around it.
    return (
        'http:'
        if self._downloader.params.get('prefer_insecure', False)
        else 'https:')
def _proto_relative_url(self, url, scheme=None):
    """Resolve a protocol-relative URL ('//host/path') by prefixing scheme
    (defaulting to the user's preferred http_scheme()). Non-relative URLs
    and None pass through unchanged. The visible block lacked the None
    guard and both return paths; restored here.
    """
    if url is None:
        return url
    if url.startswith('//'):
        if scheme is None:
            scheme = self.http_scheme()
        return scheme + url
    else:
        return url
def _sleep(self, timeout, video_id, msg_template=None):
    """Show a waiting message for video_id and sleep for timeout seconds.

    msg_template may use %(video_id)s and %(timeout)s placeholders. The
    visible block built msg but never displayed it nor slept; restored.
    """
    # Local import mirrors the file's existing local-import style; the
    # top-of-file import block is not visible in this chunk.
    import time
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    self.to_screen(msg)
    time.sleep(timeout)
# Download an Adobe HDS (f4m) manifest and return a list of format dicts,
# one per <media> node. NOTE(review): excerpt is elided — several source
# lines are missing between the numbered rows.
953 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
954 transform_source=lambda s: fix_xml_ampersands(s).strip(),
956 manifest = self._download_xml(
957 manifest_url, video_id, 'Downloading f4m manifest',
958 'Unable to download f4m manifest',
959 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
960 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
961 transform_source=transform_source,
# Non-fatal download failure: _download_xml returned False, propagate it.
964 if manifest is False:
968 manifest_version = '1.0'
969 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
971 manifest_version = '2.0'
972 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
# Optional <baseURL> (either manifest namespace), used to resolve relative media URLs.
973 base_url = xpath_text(
974 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
975 'base URL', default=None)
977 base_url = base_url.strip()
978 for i, media_el in enumerate(media_nodes):
979 if manifest_version == '2.0':
980 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
# Absolute URLs pass through unchanged; relative ones are joined with
# base_url, or with the manifest's own directory when no base URL exists.
984 media_url if media_url.startswith('http://') or media_url.startswith('https://')
985 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
986 # If media_url is itself a f4m manifest do the recursive extraction
987 # since bitrates in parent manifest (this one) and media_url manifest
988 # may differ leading to inability to resolve the format by requested
989 # bitrate in f4m downloader
# BUG(review): per the comment above this should inspect and recurse on
# media_url, not manifest_url — as written it re-downloads the *same*
# manifest and recurses indefinitely for any *.f4m manifest URL.
990 if determine_ext(manifest_url) == 'f4m':
991 formats.extend(self._extract_f4m_formats(
992 manifest_url, video_id, preference, f4m_id, fatal=fatal))
994 tbr = int_or_none(media_el.attrib.get('bitrate'))
# format_id is "<f4m_id>-<tbr>", falling back to the media index when bitrate is unknown.
996 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1000 'width': int_or_none(media_el.attrib.get('width')),
1001 'height': int_or_none(media_el.attrib.get('height')),
1002 'preference': preference,
1004 self._sort_formats(formats)
# Download an HLS (m3u8) playlist and return a list of format dicts.
# Master playlists are parsed variant-by-variant; media playlists are
# returned as a single format. NOTE(review): excerpt is elided — lines
# are missing between the numbered rows (e.g. the initial `formats`
# seeding and several loop-control lines).
1008 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1009 entry_protocol='m3u8', preference=None,
1010 m3u8_id=None, note=None, errnote=None,
# A synthetic "meta" entry pointing at the playlist URL itself, ranked
# just below the requested preference so real variants win.
1014 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1018 'preference': preference - 1 if preference else -1,
1019 'resolution': 'multiple',
1020 'format_note': 'Quality selection URL',
# Resolve playlist-relative URLs against the (possibly redirected) m3u8 URL.
1023 format_url = lambda u: (
1025 if re.match(r'^https?://', u)
1026 else compat_urlparse.urljoin(m3u8_url, u))
1028 res = self._download_webpage_handle(
1030 note=note or 'Downloading m3u8 information',
1031 errnote=errnote or 'Failed to download m3u8 information',
1035 m3u8_doc, urlh = res
# Use the final URL after redirects as the base for relative references.
1036 m3u8_url = urlh.geturl()
1038 # We should try extracting formats only from master playlists [1], i.e.
1039 # playlists that describe available qualities. On the other hand media
1040 # playlists [2] should be returned as is since they contain just the media
1041 # without qualities renditions.
1042 # Fortunately, master playlist can be easily distinguished from media
1043 # playlist based on particular tags availability. As of [1, 2] master
1044 # playlist tags MUST NOT appear in a media playist and vice versa.
1045 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1046 # and MUST NOT appear in master playlist thus we can clearly detect media
1047 # playlist with this criterion.
1048 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1049 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1050 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1051 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1054 'format_id': m3u8_id,
1056 'protocol': entry_protocol,
1057 'preference': preference,
# Parses the comma-separated ATTRIBUTE=value pairs of #EXT-X-* tags;
# values may be double-quoted (commas allowed inside) or bare.
1061 kv_rex = re.compile(
1062 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1063 for line in m3u8_doc.splitlines():
# Stream variant header: remember its attributes for the URL line that follows.
1064 if line.startswith('#EXT-X-STREAM-INF:'):
1066 for m in kv_rex.finditer(line):
1068 if v.startswith('"'):
1070 last_info[m.group('key')] = v
1071 elif line.startswith('#EXT-X-MEDIA:'):
1073 for m in kv_rex.finditer(line):
1075 if v.startswith('"'):
1077 last_media[m.group('key')] = v
# Other tags and blank lines carry no variant info — skip.
1078 elif line.startswith('#') or not line.strip():
# A bare URL with no preceding #EXT-X-STREAM-INF: emit it with no metadata.
1081 if last_info is None:
1082 formats.append({'url': format_url(line)})
# BANDWIDTH is bits/s in the playlist; scale to KBit/s for tbr.
1084 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1087 format_id.append(m3u8_id)
# Prefer the rendition NAME (unless it is a subtitles rendition); fall
# back to the bitrate or the running format count for uniqueness.
1088 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1089 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1091 'format_id': '-'.join(format_id),
1092 'url': format_url(line.strip()),
1095 'protocol': entry_protocol,
1096 'preference': preference,
# RESOLUTION is "WIDTHxHEIGHT" per the HLS spec.
1098 resolution = last_info.get('RESOLUTION')
1100 width_str, height_str = resolution.split('x')
1101 f['width'] = int(width_str)
1102 f['height'] = int(height_str)
1103 codecs = last_info.get('CODECS')
1105 vcodec, acodec = [None] * 2
1106 va_codecs = codecs.split(',')
1107 if len(va_codecs) == 1:
1108 # Audio only entries usually come with single codec and
1109 # no resolution. For more robustness we also check it to
1111 if not resolution and va_codecs[0].startswith('mp4a'):
1112 vcodec, acodec = 'none', va_codecs[0]
1114 vcodec = va_codecs[0]
1116 vcodec, acodec = va_codecs[:2]
# Keep the raw rendition attributes for downstream consumers.
1121 if last_media is not None:
1122 f['m3u8_media'] = last_media
1126 self._sort_formats(formats)
# Qualify each component of an XPath with `namespace` using ElementTree's
# '{uri}tag' syntax; '.' and empty components are kept as-is.
# NOTE(review): elided excerpt — the @staticmethod decorator and the
# `out` initialisation/`.` branch are among the missing lines.
1130 def _xpath_ns(path, namespace=None):
1134 for c in path.split('/'):
1135 if not c or c == '.':
1138 out.append('{%s}%s' % (namespace, c))
1139 return '/'.join(out)
# Download a SMIL document and return just its formats (no metadata).
# NOTE(review): elided excerpt — the `smil is False` failure handling
# between download and parse is missing from this view.
1141 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1142 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1148 namespace = self._parse_smil_namespace(smil)
1150 return self._parse_smil_formats(
1151 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
# Download a SMIL document and return the full info dict (formats,
# subtitles, metadata). NOTE(review): elided excerpt — the non-fatal
# failure branch is missing from this view.
1153 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1154 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1157 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1159 def _download_smil(self, smil_url, video_id, fatal=True):
1160 return self._download_xml(
1161 smil_url, video_id, 'Downloading SMIL file',
1162 'Unable to download SMIL file', fatal=fatal)
# Build a full info dict (formats, subtitles, title/description/date,
# thumbnails) from a parsed SMIL tree. NOTE(review): elided excerpt —
# the metadata-variable initialisations and the result-dict opening are
# among the missing lines.
1164 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1165 namespace = self._parse_smil_namespace(smil)
1167 formats = self._parse_smil_formats(
1168 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1169 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
# Fallback id: the SMIL file's own basename without extension.
1171 video_id = os.path.splitext(url_basename(smil_url))[0]
# Collect title/description/upload date from <head><meta name=... content=...>.
1175 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1176 name = meta.attrib.get('name')
1177 content = meta.attrib.get('content')
1178 if not name or not content:
# First matching meta wins — later duplicates are ignored.
1180 if not title and name == 'title':
1182 elif not description and name in ('description', 'abstract'):
1183 description = content
1184 elif not upload_date and name == 'date':
1185 upload_date = unified_strdate(content)
1188 'id': image.get('type'),
1189 'url': image.get('src'),
1190 'width': int_or_none(image.get('width')),
1191 'height': int_or_none(image.get('height')),
1192 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1196 'title': title or video_id,
1197 'description': description,
1198 'upload_date': upload_date,
1199 'thumbnails': thumbnails,
1201 'subtitles': subtitles,
1204 def _parse_smil_namespace(self, smil):
1205 return self._search_regex(
1206 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
# Turn the <video> elements of a parsed SMIL tree into format dicts,
# dispatching on protocol/extension: RTMP, HLS (m3u8), HDS (f4m) and
# plain HTTP. NOTE(review): elided excerpt — base-URL selection, the
# `srcs` set, counters and several dict bodies are among the missing lines.
1208 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
# <head><meta base=...> (or httpBase) supplies the base for relative srcs.
1210 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1211 b = meta.get('base') or meta.get('httpBase')
1222 videos = smil.findall(self._xpath_ns('.//video', namespace))
1223 for video in videos:
1224 src = video.get('src')
# Skip empty and already-seen sources.
1226 if not src or src in srcs:
# system-bitrate is in bits/s; scale to KBit/s.
1229 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1230 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1231 width = int_or_none(video.get('width'))
1232 height = int_or_none(video.get('height'))
1233 proto = video.get('proto')
1234 ext = video.get('ext')
1235 src_ext = determine_ext(src)
1236 streamer = video.get('streamer') or base
1238 if proto == 'rtmp' or streamer.startswith('rtmp'):
1244 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1246 'filesize': filesize,
# Optional caller hook to rewrite streamer/play-path pairs (e.g. akamai).
1250 if transform_rtmp_url:
1251 streamer, src = transform_rtmp_url(streamer, src)
1252 formats[-1].update({
1258 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1259 src_url = src_url.strip()
1261 if proto == 'm3u8' or src_ext == 'm3u8':
1262 m3u8_formats = self._extract_m3u8_formats(
1263 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
# A single-entry HLS result is relabelled with this video's bitrate.
1264 if len(m3u8_formats) == 1:
1266 m3u8_formats[0].update({
1267 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1272 formats.extend(m3u8_formats)
1275 if src_ext == 'f4m':
1280 'plugin': 'flowplayer-3.2.0.1',
# Append f4m_params to the manifest URL as a query string.
1282 f4m_url += '&' if '?' in f4m_url else '?'
1283 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1284 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
# NOTE(review): validity is checked on the raw `src`, not the resolved
# `src_url` — verify this is intentional (a relative src would fail the
# HEAD check while src_url would succeed).
1287 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1291 'ext': ext or src_ext or 'flv',
1292 'format_id': 'http-%d' % (bitrate or http_count),
1294 'filesize': filesize,
1300 self._sort_formats(formats)
# Collect subtitle tracks from <textstream> elements into a dict keyed
# by language. NOTE(review): elided excerpt — the `subtitles`/`urls`
# initialisation and the appended dict body are among the missing lines.
1304 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1307 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1308 src = textstream.get('src')
# Skip empty and duplicate subtitle URLs.
1309 if not src or src in urls:
# Extension: explicit attribute, then URL suffix, then MIME type.
1312 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
# Language: several attribute spellings, falling back to subtitles_lang.
1313 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1314 subtitles.setdefault(lang, []).append({
# Download an XSPF playlist and parse it into entries.
# NOTE(review): elided excerpt — the `xspf is False` failure branch is
# missing. Also note the 'xpsf' typo in the progress message (runtime
# string, left untouched here).
1320 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1321 xspf = self._download_xml(
1322 playlist_url, playlist_id, 'Downloading xpsf playlist',
1323 'Unable to download xspf manifest', fatal=fatal)
1326 return self._parse_xspf(xspf, playlist_id)
# Convert a parsed XSPF tree into a list of entry dicts, one per
# <track>, reading StreamOne's s1:* extension attributes for format
# data. NOTE(review): elided excerpt — the `entries` list handling and
# title assignment are among the missing lines.
1328 def _parse_xspf(self, playlist, playlist_id):
1330 'xspf': 'http://xspf.org/ns/0/',
1331 's1': 'http://static.streamone.nl/player/ns/0',
1335 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1337 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1338 description = xpath_text(
1339 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1340 thumbnail = xpath_text(
1341 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
# <duration> is in milliseconds; scale to seconds.
1342 duration = float_or_none(
1343 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1346 'url': location.text,
1347 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1348 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1349 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1350 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1351 self._sort_formats(formats)
1356 'description': description,
1357 'thumbnail': thumbnail,
1358 'duration': duration,
# Download a DASH MPD manifest and delegate parsing to _parse_mpd_formats.
# NOTE(review): mutable default `formats_dict={}` is a classic pitfall —
# safe only if callers never mutate it; verify before relying on it.
# Elided excerpt: the `res is False` handling and the unpack of
# (mpd, urlh) are among the missing lines.
1363 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1364 res = self._download_webpage_handle(
1366 note=note or 'Downloading MPD manifest',
1367 errnote=errnote or 'Failed to download MPD manifest',
# Base URL = final (post-redirect) manifest URL truncated at its last '/'.
1372 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1374 return self._parse_mpd_formats(
1375 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
# Parse a DASH MPD document tree into format dicts: walks
# Period > AdaptationSet > Representation, skipping DRM-protected and
# live ("dynamic") content, resolving BaseURLs and expanding
# SegmentTemplate/SegmentList into explicit segment URL lists.
# NOTE(review): mutable default `formats_dict={}` — safe only while no
# caller mutates it. Elided excerpt: many lines are missing between
# the numbered rows (returns, `continue`s, dict openings, etc.).
1377 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
# Live streams are not supported by this parser.
1378 if mpd_doc.get('type') == 'dynamic':
1381 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1384 return self._xpath_ns(path, namespace)
1386 def is_drm_protected(element):
1387 return element.find(_add_ns('ContentProtection')) is not None
# Merge multisegment info (segment URLs, template, timing) from an
# element into a copy of its parent's info, child values winning.
1389 def extract_multisegment_info(element, ms_parent_info):
1390 ms_info = ms_parent_info.copy()
1391 segment_list = element.find(_add_ns('SegmentList'))
1392 if segment_list is not None:
1393 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1395 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1396 initialization = segment_list.find(_add_ns('Initialization'))
1397 if initialization is not None:
1398 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1400 segment_template = element.find(_add_ns('SegmentTemplate'))
1401 if segment_template is not None:
1402 start_number = segment_template.get('startNumber')
1404 ms_info['start_number'] = int(start_number)
1405 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1406 if segment_timeline is not None:
1407 s_e = segment_timeline.findall(_add_ns('S'))
1409 ms_info['total_number'] = 0
# Each <S> contributes 1 segment plus its repeat count r.
1411 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1413 timescale = segment_template.get('timescale')
1415 ms_info['timescale'] = int(timescale)
1416 segment_duration = segment_template.get('duration')
1417 if segment_duration:
1418 ms_info['segment_duration'] = int(segment_duration)
1419 media_template = segment_template.get('media')
1421 ms_info['media_template'] = media_template
1422 initialization = segment_template.get('initialization')
1424 ms_info['initialization_url'] = initialization
1426 initialization = segment_template.find(_add_ns('Initialization'))
1427 if initialization is not None:
1428 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1431 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1433 for period in mpd_doc.findall(_add_ns('Period')):
1434 period_duration = parse_duration(period.get('duration')) or mpd_duration
1435 period_ms_info = extract_multisegment_info(period, {
1439 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1440 if is_drm_protected(adaptation_set):
1442 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1443 for representation in adaptation_set.findall(_add_ns('Representation')):
1444 if is_drm_protected(representation):
# Representation attributes override the AdaptationSet's.
1446 representation_attrib = adaptation_set.attrib.copy()
1447 representation_attrib.update(representation.attrib)
1448 mime_type = representation_attrib.get('mimeType')
1449 content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1450 if content_type == 'text':
1451 # TODO implement WebVTT downloading
1453 elif content_type == 'video' or content_type == 'audio':
# Accumulate BaseURL fragments from the innermost element outward,
# stopping as soon as the combined URL is absolute.
1455 for element in (representation, adaptation_set, period, mpd_doc):
1456 base_url_e = element.find(_add_ns('BaseURL'))
1457 if base_url_e is not None:
1458 base_url = base_url_e.text + base_url
1459 if re.match(r'^https?://', base_url):
1461 if mpd_base_url and not re.match(r'^https?://', base_url):
1462 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1464 base_url = mpd_base_url + base_url
1465 representation_id = representation_attrib.get('id')
1466 lang = representation_attrib.get('lang')
1467 url_el = representation.find(_add_ns('BaseURL'))
# YouTube-specific extension attribute carrying the file size.
1468 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1470 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1472 'width': int_or_none(representation_attrib.get('width')),
1473 'height': int_or_none(representation_attrib.get('height')),
# bandwidth is bits/s in the MPD; scale to KBit/s.
1474 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1475 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1476 'fps': int_or_none(representation_attrib.get('frameRate')),
1477 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1478 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
# Suppress "multiple"/"undetermined"-style language codes.
1479 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1480 'format_note': 'DASH %s' % content_type,
1481 'filesize': filesize,
1483 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1484 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
# BUG(review): `and 'segment_duration':` tests a non-empty string
# literal, which is always true — almost certainly meant
# `and 'segment_duration' in representation_ms_info`. As written it
# can KeyError below when segment_duration is absent.
1485 if 'total_number' not in representation_ms_info and 'segment_duration':
1486 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1487 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1488 media_template = representation_ms_info['media_template']
1489 media_template = media_template.replace('$RepresentationID$', representation_id)
# Convert DASH $Number%05d$-style placeholders to printf %(Number)05d.
1490 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
# BUG(review): str.replace returns a new string; this result is
# discarded, so literal '$$' escapes are never collapsed to '$'.
# Should be: media_template = media_template.replace('$$', '$')
1491 media_template.replace('$$', '$')
1492 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1493 if 'segment_urls' in representation_ms_info:
1495 'segment_urls': representation_ms_info['segment_urls'],
1496 'protocol': 'http_dash_segments',
1498 if 'initialization_url' in representation_ms_info:
1499 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1501 'initialization_url': initialization_url,
1503 if not f.get('url'):
1504 f['url'] = initialization_url
# Merge with a previously-seen format of the same id (from formats_dict
# or an earlier Period) instead of duplicating it.
1506 existing_format = next(
1507 fo for fo in formats
1508 if fo['format_id'] == representation_id)
1509 except StopIteration:
1510 full_info = formats_dict.get(representation_id, {}).copy()
1512 formats.append(full_info)
1514 existing_format.update(f)
1516 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1517 self._sort_formats(formats)
1520 def _live_title(self, name):
1521 """ Generate the title for a live video """
1522 now = datetime.datetime.now()
1523 now_str = now.strftime('%Y-%m-%d %H:%M')
1524 return name + ' ' + now_str
# Parse v as an int via int_or_none; on failure either raise (fatal) or
# warn and return None. NOTE(review): elided excerpt — the `res is None`
# / `if fatal` / `else` control lines are missing from this view.
1526 def _int(self, v, name, fatal=False, **kwargs):
1527 res = int_or_none(v, **kwargs)
1528 if 'get_attr' in kwargs:
# NOTE(review): bare print() of an attribute of v — looks like leftover
# debug output; consider routing through the downloader's logging.
1529 print(getattr(v, kwargs['get_attr']))
1531 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1533 raise ExtractorError(msg)
1535 self._downloader.report_warning(msg)
# Parse v as a float via float_or_none; on failure either raise (fatal)
# or warn and return None. NOTE(review): elided excerpt — the
# `res is None` / `if fatal` / `else` control lines are missing.
1538 def _float(self, v, name, fatal=False, **kwargs):
1539 res = float_or_none(v, **kwargs)
1541 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1543 raise ExtractorError(msg)
1545 self._downloader.report_warning(msg)
1548 def _set_cookie(self, domain, name, value, expire_time=None):
1549 cookie = compat_cookiejar.Cookie(
1550 0, name, value, None, None, domain, None,
1551 None, '/', True, False, expire_time, '', None, None, None)
1552 self._downloader.cookiejar.set_cookie(cookie)
1554 def _get_cookies(self, url):
1555 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1556 req = sanitized_Request(url)
1557 self._downloader.cookiejar.add_cookie_header(req)
1558 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
# Yield this extractor's test cases (_TEST or _TESTS), tagging each with
# the extractor name; only_matching cases are skipped unless requested.
# NOTE(review): elided excerpt — the yield statements and loop header
# are missing from this view (the method is a generator).
1560 def get_testcases(self, include_onlymatching=False):
1561 t = getattr(self, '_TEST', None)
# Defining both _TEST and _TESTS is a programming error.
1563 assert not hasattr(self, '_TESTS'), \
1564 '%s has _TEST and _TESTS' % type(self).__name__
1567 tests = getattr(self, '_TESTS', [])
1569 if not include_onlymatching and t.get('only_matching', False):
# Tag with the extractor class name minus the 'IE' suffix.
1571 t['name'] = type(self).__name__[:-len('IE')]
# NOTE(review): elided excerpt — the early `return True` for an
# unrestricted test case is among the missing lines.
1574 def is_suitable(self, age_limit):
1575 """ Test whether the extractor is generally suitable for the given
1576 age limit (i.e. pornographic sites are not, all others usually are) """
1578 any_restricted = False
1579 for tc in self.get_testcases(include_onlymatching=False):
# Playlist test cases are judged by their first entry.
1580 if 'playlist' in tc:
1581 tc = tc['playlist'][0]
1582 is_restricted = age_restricted(
1583 tc.get('info_dict', {}).get('age_limit'), age_limit)
1584 if not is_restricted:
1586 any_restricted = any_restricted or is_restricted
# Suitable unless every test case was age-restricted.
1587 return not any_restricted
# Public entry point for subtitle extraction: delegates to the subclass
# hook _get_subtitles only when subtitles were requested via options.
# NOTE(review): elided excerpt — the fallthrough return (presumably an
# empty dict) is missing from this view.
1589 def extract_subtitles(self, *args, **kwargs):
1590 if (self._downloader.params.get('writesubtitles', False) or
1591 self._downloader.params.get('listsubtitles')):
1592 return self._get_subtitles(*args, **kwargs)
1595 def _get_subtitles(self, *args, **kwargs):
1596 raise NotImplementedError('This method must be implemented by subclasses')
# NOTE(review): elided excerpt — the @staticmethod decorator and the
# final `return ret` are among the missing lines.
1599 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1600 """ Merge subtitle items for one language. Items with duplicated URLs
1601 will be dropped. """
# URLs already present in the first list win; list2 duplicates are dropped.
1602 list1_urls = set([item['url'] for item in subtitle_list1])
1603 ret = list(subtitle_list1)
1604 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
# NOTE(review): elided excerpt — the @classmethod decorator and the
# final `return ret` are among the missing lines.
1608 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1609 """ Merge two subtitle dictionaries, language by language. """
1610 ret = dict(subtitle_dict1)
1611 for lang in subtitle_dict2:
1612 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
# Public entry point for automatic captions: delegates to the subclass
# hook _get_automatic_captions only when requested via options.
# NOTE(review): elided excerpt — the fallthrough return is missing.
1615 def extract_automatic_captions(self, *args, **kwargs):
1616 if (self._downloader.params.get('writeautomaticsub', False) or
1617 self._downloader.params.get('listsubtitles')):
1618 return self._get_automatic_captions(*args, **kwargs)
1621 def _get_automatic_captions(self, *args, **kwargs):
1622 raise NotImplementedError('This method must be implemented by subclasses')
1624 def mark_watched(self, *args, **kwargs):
1625 if (self._downloader.params.get('mark_watched', False) and
1626 (self._get_login_info()[0] is not None or
1627 self._downloader.params.get('cookiefile') is not None)):
1628 self._mark_watched(*args, **kwargs)
1630 def _mark_watched(self, *args, **kwargs):
1631 raise NotImplementedError('This method must be implemented by subclasses')
# NOTE(review): elided excerpt — the docstring terminator, _MAX_RESULTS
# default, @classmethod/@property decorators, and several branch lines
# (e.g. the mobj-is-None test, empty-prefix test, int(prefix) parse)
# are missing from this view.
1634 class SearchInfoExtractor(InfoExtractor):
1636 Base class for paged search queries extractors.
1637 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1638 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Builds the regex all search URLs must match: "<key><prefix>:<query>",
# where prefix is empty, a positive integer, or 'all'.
1642 def _make_valid_url(cls):
1643 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1646 def suitable(cls, url):
1647 return re.match(cls._make_valid_url(), url) is not None
# Dispatch on the prefix: '' -> 1 result, 'all' -> _MAX_RESULTS,
# otherwise the requested count clamped to _MAX_RESULTS.
1649 def _real_extract(self, query):
1650 mobj = re.match(self._make_valid_url(), query)
1652 raise ExtractorError('Invalid search query "%s"' % query)
1654 prefix = mobj.group('prefix')
1655 query = mobj.group('query')
1657 return self._get_n_results(query, 1)
1658 elif prefix == 'all':
1659 return self._get_n_results(query, self._MAX_RESULTS)
1663 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1664 elif n > self._MAX_RESULTS:
1665 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1666 n = self._MAX_RESULTS
1667 return self._get_n_results(query, n)
1669 def _get_n_results(self, query, n):
1670 """Get a specified number of results for a query"""
1671 raise NotImplementedError('This method must be implemented by subclasses')
# Read-only accessor for the class's search key (property decorator elided).
1674 def SEARCH_KEY(self):
1675 return self._SEARCH_KEY