1 from __future__ import unicode_literals
15 from ..compat import (
18 compat_etree_fromstring,
55 class InfoExtractor(object):
56 """Information Extractor class.
58 Information extractors are the classes that, given a URL, extract
59 information about the video (or videos) the URL refers to. This
60 information includes the real video URL, the video title, author and
61 others. The information is stored in a dictionary which is then
62 passed to the YoutubeDL. The YoutubeDL processes this
63 information possibly downloading the video to the file system, among
64 other possible outcomes.
66 The type field determines the type of the result.
67 By far the most common value (and the default if _type is missing) is
68 "video", which indicates a single video.
70 For a video, the dictionaries must include the following fields:
73 title: Video title, unescaped.
75 Additionally, it must contain either a formats entry or a url one:
77 formats: A list of dictionaries for each format available, ordered
78 from worst to best quality.
81 * url Mandatory. The URL of the video file
82 * ext Will be calculated from URL if missing
83 * format A human-readable description of the format
84 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
86 and format_note fields if missing.
87 * format_id A short description of the format
88 ("mp4_h264_opus" or "19").
89 Technically optional, but strongly recommended.
90 * format_note Additional info about the format
91 ("3D" or "DASH video")
92 * width Width of the video, if known
93 * height Height of the video, if known
94 * resolution Textual description of width and height
95 * tbr Average bitrate of audio and video in KBit/s
96 * abr Average audio bitrate in KBit/s
97 * acodec Name of the audio codec in use
98 * asr Audio sampling rate in Hertz
99 * vbr Average video bitrate in KBit/s
101 * vcodec Name of the video codec in use
102 * container Name of the container format
103 * filesize The number of bytes, if known in advance
104 * filesize_approx An estimate for the number of bytes
105 * player_url SWF Player URL (used for rtmpdump).
106 * protocol The protocol that will be used for the actual
107 download, lower-case.
108 "http", "https", "rtsp", "rtmp", "rtmpe",
109 "m3u8", "m3u8_native" or "http_dash_segments".
110 * preference Order number of this format. If this field is
111 present and not None, the formats get sorted
112 by this field, regardless of all other values.
113 -1 for default (order by other properties),
114 -2 or smaller for less than default.
115 < -1000 to hide the format (if there is
116 another one which is strictly better)
117 * language Language code, e.g. "de" or "en-US".
118 * language_preference Is this in the language mentioned in
120 10 if it's what the URL is about,
121 -1 for default (don't know),
122 -10 otherwise, other values reserved for now.
123 * quality Order number of the video quality of this
124 format, irrespective of the file format.
125 -1 for default (order by other properties),
126 -2 or smaller for less than default.
127 * source_preference Order number for this video source
128 (quality takes higher priority)
129 -1 for default (order by other properties),
130 -2 or smaller for less than default.
131 * http_headers A dictionary of additional HTTP headers
132 to add to the request.
133 * stretched_ratio If given and not 1, indicates that the
134 video's pixels are not square.
135 width : height ratio as float.
136 * no_resume The server does not support resuming the
137 (HTTP or RTMP) download. Boolean.
139 url: Final video URL.
140 ext: Video filename extension.
141 format: The video format, defaults to ext (used for --get-format)
142 player_url: SWF Player URL (used for rtmpdump).
144 The following fields are optional:
146 alt_title: A secondary title of the video.
147 display_id An alternative identifier for the video, not necessarily
148 unique, but available before title. Typically, id is
149 something like "4234987", title "Dancing naked mole rats",
150 and display_id "dancing-naked-mole-rats"
151 thumbnails: A list of dictionaries, with the following entries:
152 * "id" (optional, string) - Thumbnail format ID
154 * "preference" (optional, int) - quality of the image
155 * "width" (optional, int)
156 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
159 thumbnail: Full URL to a video thumbnail image.
160 description: Full video description.
161 uploader: Full name of the video uploader.
162 license: License name the video is licensed under.
163 creator: The main artist who created the video.
164 release_date: The date (YYYYMMDD) when the video was released.
165 timestamp: UNIX timestamp of the moment the video became available.
166 upload_date: Video upload date (YYYYMMDD).
167 If not explicitly set, calculated from timestamp.
168 uploader_id: Nickname or id of the video uploader.
169 uploader_url: Full URL to a personal webpage of the video uploader.
170 location: Physical location where the video was filmed.
171 subtitles: The available subtitles as a dictionary in the format
172 {language: subformats}. "subformats" is a list sorted from
173 lower to higher preference, each element is a dictionary
174 with the "ext" entry and one of:
175 * "data": The subtitles file contents
176 * "url": A URL pointing to the subtitles file
177 "ext" will be calculated from URL if missing
178 automatic_captions: Like 'subtitles', used by the YoutubeIE for
179 automatically generated captions
180 duration: Length of the video in seconds, as an integer or float.
181 view_count: How many users have watched the video on the platform.
182 like_count: Number of positive ratings of the video
183 dislike_count: Number of negative ratings of the video
184 repost_count: Number of reposts of the video
average_rating: Average rating given by users, the scale used depends on the webpage
186 comment_count: Number of comments on the video
187 comments: A list of comments, each with one or more of the following
188 properties (all but one of text or html optional):
189 * "author" - human-readable name of the comment author
190 * "author_id" - user ID of the comment author
192 * "html" - Comment as HTML
193 * "text" - Plain text of the comment
194 * "timestamp" - UNIX timestamp of comment
195 * "parent" - ID of the comment this one is replying to.
196 Set to "root" to indicate that this is a
197 comment to the original video.
198 age_limit: Age restriction for the video, as an integer (years)
199 webpage_url: The URL to the video webpage, if given to youtube-dl it
200 should allow to get the same result again. (It will be set
201 by YoutubeDL if it's missing)
202 categories: A list of categories that the video falls in, for example
204 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
205 is_live: True, False, or None (=unknown). Whether this video is a
206 live stream that goes on instead of a fixed-length video.
207 start_time: Time in seconds where the reproduction should start, as
208 specified in the URL.
209 end_time: Time in seconds where the reproduction should end, as
210 specified in the URL.
212 The following fields should only be used when the video belongs to some logical
215 chapter: Name or title of the chapter the video belongs to.
216 chapter_number: Number of the chapter the video belongs to, as an integer.
217 chapter_id: Id of the chapter the video belongs to, as a unicode string.
219 The following fields should only be used when the video is an episode of some
222 series: Title of the series or programme the video episode belongs to.
223 season: Title of the season the video episode belongs to.
224 season_number: Number of the season the video episode belongs to, as an integer.
225 season_id: Id of the season the video episode belongs to, as a unicode string.
226 episode: Title of the video episode. Unlike mandatory video title field,
227 this field should denote the exact title of the video episode
228 without any kind of decoration.
229 episode_number: Number of the video episode within a season, as an integer.
230 episode_id: Id of the video episode, as a unicode string.
232 Unless mentioned otherwise, the fields should be Unicode strings.
234 Unless mentioned otherwise, None is equivalent to absence of information.
237 _type "playlist" indicates multiple videos.
238 There must be a key "entries", which is a list, an iterable, or a PagedList
239 object, each element of which is a valid dictionary by this specification.
241 Additionally, playlists can have "title", "description" and "id" attributes
242 with the same semantics as videos (see above).
245 _type "multi_video" indicates that there are multiple videos that
form a single show, for example multiple acts of an opera or TV episode.
247 It must have an entries key like a playlist and contain all the keys
248 required for a video at the same time.
251 _type "url" indicates that the video must be extracted from another
252 location, possibly by a different extractor. Its only required key is:
253 "url" - the next URL to extract.
254 The key "ie_key" can be set to the class name (minus the trailing "IE",
255 e.g. "Youtube") if the extractor class is known in advance.
256 Additionally, the dictionary may have any properties of the resolved entity
257 known in advance, for example "title" if the title of the referred video is
261 _type "url_transparent" entities have the same specification as "url", but
262 indicate that the given additional information is more precise than the one
263 associated with the resolved URL.
264 This is useful when a site employs a video service that hosts the video and
265 its technical metadata, but that video service does not embed a useful
266 title, description etc.
269 Subclasses of this one should re-define the _real_initialize() and
270 _real_extract() methods and define a _VALID_URL regexp.
271 Probably, they should also be added to the list of extractors.
273 Finally, the _WORKING attribute should be set to False for broken IEs
274 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Delegate to set_downloader() so the downloader can also be
        # (re-)attached later through the same code path.
        self.set_downloader(downloader)
287 def suitable(cls, url):
288 """Receives a URL and returns True if suitable for this IE."""
290 # This does not use has/getattr intentionally - we want to know whether
291 # we have cached the regexp for *this* class, whereas getattr would also
292 # match the superclass
293 if '_VALID_URL_RE' not in cls.__dict__:
294 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
295 return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        """Match *url* against this class's cached _VALID_URL pattern."""
        # Same per-class caching trick as in suitable(): checking
        # cls.__dict__ avoids reusing a regexp cached on a superclass.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
307 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # The real work is delegated to the subclass hook.
            self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
            return self._real_extract(url)
        except ExtractorError:
        # Network-level truncation is reported as an "expected" error so the
        # user gets a clean message instead of a traceback.
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        # A KeyError/StopIteration escaping an extractor is a bug in the
        # extractor itself; wrap it so the original cause is preserved.
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
328 def set_downloader(self, downloader):
329 """Sets the downloader for this IE."""
330 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Subclass hook for per-site setup such as authentication.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Subclasses return an info dict (or playlist/url result) as
        # described in the class docstring.
342 """A string for getting the InfoExtractor with get_info_extractor"""
343 return compat_str(cls.__name__[:-2])
347 return compat_str(type(self).__name__[:-2])
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" report,
        # False -> no message at all, anything else -> printed as given.
            self.report_download_webpage(video_id)
        elif note is not False:
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            # Fatal errors re-raise with the original traceback and cause;
            # non-fatal ones are downgraded to a downloader warning.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
                self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]
        # (Request objects pass through unchanged.)
        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Best-effort charset detection: Content-Type header first, then an
        HTML meta charset tag, then a BOM sniff."""
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Only scan the start of the document for a meta charset.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
        # b'\xff\xfe' is the UTF-16 little-endian BOM.
        elif webpage_bytes.startswith(b'\xff\xfe'):
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Decode the response body of *urlh*, honouring the
        dump_intermediate_pages / write_pages downloader params, and turn
        well-known censorship pages into ExtractorErrors."""
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            # base64 keeps binary-ish payloads printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
                # Keep the dump filename bounded by replacing the tail with a
                # hash of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)
            content = webpage_bytes.decode(encoding, 'replace')
            content = webpage_bytes.decode('utf-8', 'replace')
        # Websense filtering block page detection.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian censorship block page detection.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
        """ Returns the data of the page as a string """
        # Retry loop: IncompleteRead is retried up to *tries* times with a
        # *timeout*-second sleep between attempts.
        while success is False:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
            except compat_http_client.IncompleteRead as e:
                if try_count >= tries:
                self._sleep(timeout, video_id)
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
        if xml_string is False:
            # Optional caller hook to fix up broken XML before parsing.
            xml_string = transform_source(xml_string)
        return compat_etree_fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None):
        # Download a page and parse it as JSON via _parse_json, forwarding
        # transform_source/fatal.
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
        # A non-fatal failed download yields False; propagate quietly.
        if (not fatal) and json_string is False:
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        """Parse *json_string*; on failure raise (fatal) or warn and continue."""
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
                raise ExtractorError(errmsg, cause=ve)
                self.report_warning(errmsg + str(ve))
522 def report_warning(self, msg, video_id=None):
523 idstr = '' if video_id is None else '%s: ' % video_id
524 self._downloader.report_warning(
525 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
527 def to_screen(self, msg):
528 """Print msg to screen, prefixing it with '[ie_name]'"""
529 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
531 def report_extraction(self, id_or_name):
532 """Report information extraction."""
533 self.to_screen('%s: Extracting information' % id_or_name)
535 def report_download_webpage(self, video_id):
536 """Report webpage download."""
537 self.to_screen('%s: Downloading webpage' % video_id)
539 def report_age_confirmation(self):
540 """Report attempt to confirm age."""
541 self.to_screen('Confirming age')
543 def report_login(self):
544 """Report attempt to log in."""
545 self.to_screen('Logging in')
    def raise_login_required(msg='This video is only available for registered users'):
        # Abort extraction, telling the user how to supply credentials.
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        # Abort extraction, suggesting --proxy as the standard workaround.
        raise ExtractorError(
            '%s. You might want to use --proxy to workaround.' % msg,
559 # Methods for following #608
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # id/title are optional hints, attached only when actually known.
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # id/title/description are attached only when supplied/truthy.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
                mobj = re.search(p, string, flags)
        # Colorize the field name in error output on capable terminals only.
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
            # clean_html also unescapes entities; strip() trims whitespace.
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
        downloader_params = self._downloader.params
        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username') is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    # No ~/.netrc entry for this machine.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best effort: a broken .netrc only produces a warning.
                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
        return (username, password)
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
        downloader_params = self._downloader.params
        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']
        # Fall back to interactively prompting on the terminal.
        return compat_getpass('Type %s and press [Return]: ' % note)
676 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        """Build regexes matching the OpenGraph meta tag for *prop*, covering
        both attribute orders (property-then-content and the reverse)."""
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _meta_regex(prop):
        """Regex for a <meta> tag whose itemprop/name/property/id/http-equiv
        equals *prop*; the content attribute is captured as 'content'."""
        # (?isx): dot-all, case-insensitive, verbose — the literal whitespace
        # inside the pattern is ignored by the regex engine.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search *html* for the og:<prop> meta tag and return its unescaped content."""
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
702 def _og_search_thumbnail(self, html, **kargs):
703 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
705 def _og_search_description(self, html, **kargs):
706 return self._og_search_property('description', html, fatal=False, **kargs)
708 def _og_search_title(self, html, **kargs):
709 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Extract the video URL from og:video / og:video:url meta tags."""
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            # og:video:secure_url variants are tried first when secure.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
717 def _og_search_url(self, html, **kargs):
718 return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Find a <meta> tag by *name* (see _meta_regex) and return its content."""
        if display_name is None:
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
727 def _dc_search_uploader(self, html):
728 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        """Detect the RTA ("Restricted To Adults") label in the page."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # RATING_TABLE maps the textual rating to an age limit
        # (table definition not visible in this chunk).
        return RATING_TABLE.get(rating.lower())
    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)
        if not family_friendly:
        # RATING_TABLE maps the isFamilyFriendly value to an age limit
        # (table definition not visible in this chunk).
        return RATING_TABLE.get(family_friendly.lower())
769 def _twitter_search_player(self, html):
770 return self._html_search_meta('twitter:player', html,
771 'twitter card player')
    def _search_json_ld(self, html, video_id, **kwargs):
        """Locate the JSON-LD <script> block in *html* and map it via _json_ld."""
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
    def _json_ld(self, json_ld, video_id, fatal=True):
        """Map a schema.org JSON-LD object (string or dict) to info-dict fields."""
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if json_ld.get('@context') == 'http://schema.org':
            item_type = json_ld.get('@type')
            if item_type == 'TVEpisode':
                    'episode': unescapeHTML(json_ld.get('name')),
                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
                    'description': unescapeHTML(json_ld.get('description')),
                part_of_season = json_ld.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = json_ld.get('partOfSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
                    'title': unescapeHTML(json_ld.get('headline')),
                    'description': unescapeHTML(json_ld.get('articleBody')),
        # Drop keys whose value could not be extracted.
        return dict((k, v) for k, v in info.items() if v is not None)
    def _hidden_inputs(html):
        """Collect name -> value for hidden/submit <input> fields in *html*."""
        # HTML comments may contain bogus <input>s; drop them first.
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        for input in re.findall(r'(?i)<input([^>]+)>', html):
            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
            hidden_inputs[name.group('value')] = value.group('value')
825 def _form_hidden_inputs(self, form_id, html):
826 form = self._search_regex(
827 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
828 html, '%s form' % form_id, group='form')
829 return self._hidden_inputs(form)
    def _sort_formats(self, formats, field_preference=None):
        """Sort *formats* in place from worst to best quality."""
            raise ExtractorError('No video formats found')
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])
            # Caller-supplied field list short-circuits all other criteria.
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
            preference = f.get('preference')
            if preference is None:
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                    ORDER = ['webm', 'flv', 'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0
                # Missing numeric fields compare as -1 (worst), missing
                # format_id as the empty string.
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        """Drop formats whose URL does not respond (see _is_valid_url)."""
                lambda f: self._is_valid_url(
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
    def _remove_duplicate_formats(formats):
        """Drop formats sharing the same URL, keeping the first occurrence; in place."""
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats
    def _is_valid_url(self, url, video_id, item='video'):
        """Probe *url* with a request; URL-level failures mark it invalid."""
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                    '%s: %s URL is invalid, skipping' % (video_id, item))
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
            # prefer_insecure downgrades protocol-relative URLs to http.
            if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        """Resolve a protocol-relative //host/path URL against *scheme*
        (defaulting to the user's preferred scheme, see http_scheme)."""
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        # msg_template may reference %(video_id)s and %(timeout)s.
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
        # A False manifest means the download failed non-fatally.
        if manifest is False:
        # Detect the manifest version by which namespace the <media> nodes use.
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
            base_url = base_url.strip()
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
            tbr = int_or_none(media_el.attrib.get('bitrate'))
                # Fall back to the node index when no bitrate is declared.
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
        self._sort_formats(formats)
1011 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1012 entry_protocol='m3u8', preference=None,
1013 m3u8_id=None, note=None, errnote=None,
1017 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1021 'preference': preference - 1 if preference else -1,
1022 'resolution': 'multiple',
1023 'format_note': 'Quality selection URL',
1026 format_url = lambda u: (
1028 if re.match(r'^https?://', u)
1029 else compat_urlparse.urljoin(m3u8_url, u))
1031 res = self._download_webpage_handle(
1033 note=note or 'Downloading m3u8 information',
1034 errnote=errnote or 'Failed to download m3u8 information',
1038 m3u8_doc, urlh = res
1039 m3u8_url = urlh.geturl()
1041 # We should try extracting formats only from master playlists [1], i.e.
1042 # playlists that describe available qualities. On the other hand media
1043 # playlists [2] should be returned as is since they contain just the media
1044 # without qualities renditions.
1045 # Fortunately, master playlist can be easily distinguished from media
1046 # playlist based on particular tags availability. As of [1, 2] master
1047 # playlist tags MUST NOT appear in a media playist and vice versa.
1048 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1049 # and MUST NOT appear in master playlist thus we can clearly detect media
1050 # playlist with this criterion.
1051 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1052 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1053 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1054 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1057 'format_id': m3u8_id,
1059 'protocol': entry_protocol,
1060 'preference': preference,
1064 kv_rex = re.compile(
1065 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1066 for line in m3u8_doc.splitlines():
1067 if line.startswith('#EXT-X-STREAM-INF:'):
1069 for m in kv_rex.finditer(line):
1071 if v.startswith('"'):
1073 last_info[m.group('key')] = v
1074 elif line.startswith('#EXT-X-MEDIA:'):
1076 for m in kv_rex.finditer(line):
1078 if v.startswith('"'):
1080 last_media[m.group('key')] = v
1081 elif line.startswith('#') or not line.strip():
1084 if last_info is None:
1085 formats.append({'url': format_url(line)})
1087 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1090 format_id.append(m3u8_id)
1091 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1092 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1094 'format_id': '-'.join(format_id),
1095 'url': format_url(line.strip()),
1098 'protocol': entry_protocol,
1099 'preference': preference,
1101 resolution = last_info.get('RESOLUTION')
1103 width_str, height_str = resolution.split('x')
1104 f['width'] = int(width_str)
1105 f['height'] = int(height_str)
1106 codecs = last_info.get('CODECS')
1108 vcodec, acodec = [None] * 2
1109 va_codecs = codecs.split(',')
1110 if len(va_codecs) == 1:
1111 # Audio only entries usually come with single codec and
1112 # no resolution. For more robustness we also check it to
1114 if not resolution and va_codecs[0].startswith('mp4a'):
1115 vcodec, acodec = 'none', va_codecs[0]
1117 vcodec = va_codecs[0]
1119 vcodec, acodec = va_codecs[:2]
1124 if last_media is not None:
1125 f['m3u8_media'] = last_media
1129 self._sort_formats(formats)
1133 def _xpath_ns(path, namespace=None):
1137 for c in path.split('/'):
1138 if not c or c == '.':
1141 out.append('{%s}%s' % (namespace, c))
1142 return '/'.join(out)
1144 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1145 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1151 namespace = self._parse_smil_namespace(smil)
1153 return self._parse_smil_formats(
1154 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1156 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1157 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1160 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1162 def _download_smil(self, smil_url, video_id, fatal=True):
1163 return self._download_xml(
1164 smil_url, video_id, 'Downloading SMIL file',
1165 'Unable to download SMIL file', fatal=fatal)
1167 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1168 namespace = self._parse_smil_namespace(smil)
1170 formats = self._parse_smil_formats(
1171 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1172 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1174 video_id = os.path.splitext(url_basename(smil_url))[0]
1178 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1179 name = meta.attrib.get('name')
1180 content = meta.attrib.get('content')
1181 if not name or not content:
1183 if not title and name == 'title':
1185 elif not description and name in ('description', 'abstract'):
1186 description = content
1187 elif not upload_date and name == 'date':
1188 upload_date = unified_strdate(content)
1191 'id': image.get('type'),
1192 'url': image.get('src'),
1193 'width': int_or_none(image.get('width')),
1194 'height': int_or_none(image.get('height')),
1195 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1199 'title': title or video_id,
1200 'description': description,
1201 'upload_date': upload_date,
1202 'thumbnails': thumbnails,
1204 'subtitles': subtitles,
1207 def _parse_smil_namespace(self, smil):
1208 return self._search_regex(
1209 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1211 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1213 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1214 b = meta.get('base') or meta.get('httpBase')
1225 videos = smil.findall(self._xpath_ns('.//video', namespace))
1226 for video in videos:
1227 src = video.get('src')
1228 if not src or src in srcs:
1232 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1233 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1234 width = int_or_none(video.get('width'))
1235 height = int_or_none(video.get('height'))
1236 proto = video.get('proto')
1237 ext = video.get('ext')
1238 src_ext = determine_ext(src)
1239 streamer = video.get('streamer') or base
1241 if proto == 'rtmp' or streamer.startswith('rtmp'):
1247 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1249 'filesize': filesize,
1253 if transform_rtmp_url:
1254 streamer, src = transform_rtmp_url(streamer, src)
1255 formats[-1].update({
1261 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1262 src_url = src_url.strip()
1264 if proto == 'm3u8' or src_ext == 'm3u8':
1265 m3u8_formats = self._extract_m3u8_formats(
1266 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1267 if len(m3u8_formats) == 1:
1269 m3u8_formats[0].update({
1270 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1275 formats.extend(m3u8_formats)
1278 if src_ext == 'f4m':
1283 'plugin': 'flowplayer-3.2.0.1',
1285 f4m_url += '&' if '?' in f4m_url else '?'
1286 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1287 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1290 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1294 'ext': ext or src_ext or 'flv',
1295 'format_id': 'http-%d' % (bitrate or http_count),
1297 'filesize': filesize,
1303 self._sort_formats(formats)
1307 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1310 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1311 src = textstream.get('src')
1312 if not src or src in urls:
1315 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1316 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1317 subtitles.setdefault(lang, []).append({
1323 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1324 xspf = self._download_xml(
1325 playlist_url, playlist_id, 'Downloading xpsf playlist',
1326 'Unable to download xspf manifest', fatal=fatal)
1329 return self._parse_xspf(xspf, playlist_id)
1331 def _parse_xspf(self, playlist, playlist_id):
1333 'xspf': 'http://xspf.org/ns/0/',
1334 's1': 'http://static.streamone.nl/player/ns/0',
1338 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1340 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1341 description = xpath_text(
1342 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1343 thumbnail = xpath_text(
1344 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1345 duration = float_or_none(
1346 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1349 'url': location.text,
1350 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1351 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1352 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1353 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1354 self._sort_formats(formats)
1359 'description': description,
1360 'thumbnail': thumbnail,
1361 'duration': duration,
1366 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1367 res = self._download_webpage_handle(
1369 note=note or 'Downloading MPD manifest',
1370 errnote=errnote or 'Failed to download MPD manifest',
1375 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1377 return self._parse_mpd_formats(
1378 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1380 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1381 if mpd_doc.get('type') == 'dynamic':
1384 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1387 return self._xpath_ns(path, namespace)
1389 def is_drm_protected(element):
1390 return element.find(_add_ns('ContentProtection')) is not None
1392 def extract_multisegment_info(element, ms_parent_info):
1393 ms_info = ms_parent_info.copy()
1394 segment_list = element.find(_add_ns('SegmentList'))
1395 if segment_list is not None:
1396 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1398 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1399 initialization = segment_list.find(_add_ns('Initialization'))
1400 if initialization is not None:
1401 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1403 segment_template = element.find(_add_ns('SegmentTemplate'))
1404 if segment_template is not None:
1405 start_number = segment_template.get('startNumber')
1407 ms_info['start_number'] = int(start_number)
1408 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1409 if segment_timeline is not None:
1410 s_e = segment_timeline.findall(_add_ns('S'))
1412 ms_info['total_number'] = 0
1414 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1416 timescale = segment_template.get('timescale')
1418 ms_info['timescale'] = int(timescale)
1419 segment_duration = segment_template.get('duration')
1420 if segment_duration:
1421 ms_info['segment_duration'] = int(segment_duration)
1422 media_template = segment_template.get('media')
1424 ms_info['media_template'] = media_template
1425 initialization = segment_template.get('initialization')
1427 ms_info['initialization_url'] = initialization
1429 initialization = segment_template.find(_add_ns('Initialization'))
1430 if initialization is not None:
1431 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1434 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1436 for period in mpd_doc.findall(_add_ns('Period')):
1437 period_duration = parse_duration(period.get('duration')) or mpd_duration
1438 period_ms_info = extract_multisegment_info(period, {
1442 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1443 if is_drm_protected(adaptation_set):
1445 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1446 for representation in adaptation_set.findall(_add_ns('Representation')):
1447 if is_drm_protected(representation):
1449 representation_attrib = adaptation_set.attrib.copy()
1450 representation_attrib.update(representation.attrib)
1451 mime_type = representation_attrib.get('mimeType')
1452 content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1453 if content_type == 'text':
1454 # TODO implement WebVTT downloading
1456 elif content_type == 'video' or content_type == 'audio':
1458 for element in (representation, adaptation_set, period, mpd_doc):
1459 base_url_e = element.find(_add_ns('BaseURL'))
1460 if base_url_e is not None:
1461 base_url = base_url_e.text + base_url
1462 if re.match(r'^https?://', base_url):
1464 if mpd_base_url and not re.match(r'^https?://', base_url):
1465 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1467 base_url = mpd_base_url + base_url
1468 representation_id = representation_attrib.get('id')
1469 lang = representation_attrib.get('lang')
1470 url_el = representation.find(_add_ns('BaseURL'))
1471 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1473 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1475 'ext': codec2ext(representation_attrib.get('codecs')),
1476 'width': int_or_none(representation_attrib.get('width')),
1477 'height': int_or_none(representation_attrib.get('height')),
1478 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1479 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1480 'fps': int_or_none(representation_attrib.get('frameRate')),
1481 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1482 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1483 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1484 'format_note': 'DASH %s' % content_type,
1485 'filesize': filesize,
1487 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1488 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1489 if 'total_number' not in representation_ms_info and 'segment_duration':
1490 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1491 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1492 media_template = representation_ms_info['media_template']
1493 media_template = media_template.replace('$RepresentationID$', representation_id)
1494 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1495 media_template.replace('$$', '$')
1496 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1497 if 'segment_urls' in representation_ms_info:
1499 'segment_urls': representation_ms_info['segment_urls'],
1500 'protocol': 'http_dash_segments',
1502 if 'initialization_url' in representation_ms_info:
1503 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1505 'initialization_url': initialization_url,
1507 if not f.get('url'):
1508 f['url'] = initialization_url
1510 existing_format = next(
1511 fo for fo in formats
1512 if fo['format_id'] == representation_id)
1513 except StopIteration:
1514 full_info = formats_dict.get(representation_id, {}).copy()
1516 formats.append(full_info)
1518 existing_format.update(f)
1520 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1521 self._sort_formats(formats)
1524 def _live_title(self, name):
1525 """ Generate the title for a live video """
1526 now = datetime.datetime.now()
1527 now_str = now.strftime('%Y-%m-%d %H:%M')
1528 return name + ' ' + now_str
1530 def _int(self, v, name, fatal=False, **kwargs):
1531 res = int_or_none(v, **kwargs)
1532 if 'get_attr' in kwargs:
1533 print(getattr(v, kwargs['get_attr']))
1535 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1537 raise ExtractorError(msg)
1539 self._downloader.report_warning(msg)
1542 def _float(self, v, name, fatal=False, **kwargs):
1543 res = float_or_none(v, **kwargs)
1545 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1547 raise ExtractorError(msg)
1549 self._downloader.report_warning(msg)
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Insert a cookie into the shared downloader cookie jar so it is sent
        # with subsequent requests to `domain`.
        # Positional args follow the http.cookiejar.Cookie constructor order:
        # version=0, name, value, port=None, port_specified=None, domain,
        # domain_specified=None, domain_initial_dot=None, path='/',
        # path_specified=True, secure=False, expires=expire_time, discard='',
        # comment=None, comment_url=None, rest=None.
        # NOTE(review): confirm this mapping against the compat_cookiejar
        # layer — the order is easy to break and not checked at runtime.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1558 def _get_cookies(self, url):
1559 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1560 req = sanitized_Request(url)
1561 self._downloader.cookiejar.add_cookie_header(req)
1562 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1564 def get_testcases(self, include_onlymatching=False):
1565 t = getattr(self, '_TEST', None)
1567 assert not hasattr(self, '_TESTS'), \
1568 '%s has _TEST and _TESTS' % type(self).__name__
1571 tests = getattr(self, '_TESTS', [])
1573 if not include_onlymatching and t.get('only_matching', False):
1575 t['name'] = type(self).__name__[:-len('IE')]
1578 def is_suitable(self, age_limit):
1579 """ Test whether the extractor is generally suitable for the given
1580 age limit (i.e. pornographic sites are not, all others usually are) """
1582 any_restricted = False
1583 for tc in self.get_testcases(include_onlymatching=False):
1584 if 'playlist' in tc:
1585 tc = tc['playlist'][0]
1586 is_restricted = age_restricted(
1587 tc.get('info_dict', {}).get('age_limit'), age_limit)
1588 if not is_restricted:
1590 any_restricted = any_restricted or is_restricted
1591 return not any_restricted
1593 def extract_subtitles(self, *args, **kwargs):
1594 if (self._downloader.params.get('writesubtitles', False) or
1595 self._downloader.params.get('listsubtitles')):
1596 return self._get_subtitles(*args, **kwargs)
1599 def _get_subtitles(self, *args, **kwargs):
1600 raise NotImplementedError('This method must be implemented by subclasses')
1603 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1604 """ Merge subtitle items for one language. Items with duplicated URLs
1605 will be dropped. """
1606 list1_urls = set([item['url'] for item in subtitle_list1])
1607 ret = list(subtitle_list1)
1608 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1612 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1613 """ Merge two subtitle dictionaries, language by language. """
1614 ret = dict(subtitle_dict1)
1615 for lang in subtitle_dict2:
1616 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1619 def extract_automatic_captions(self, *args, **kwargs):
1620 if (self._downloader.params.get('writeautomaticsub', False) or
1621 self._downloader.params.get('listsubtitles')):
1622 return self._get_automatic_captions(*args, **kwargs)
1625 def _get_automatic_captions(self, *args, **kwargs):
1626 raise NotImplementedError('This method must be implemented by subclasses')
1628 def mark_watched(self, *args, **kwargs):
1629 if (self._downloader.params.get('mark_watched', False) and
1630 (self._get_login_info()[0] is not None or
1631 self._downloader.params.get('cookiefile') is not None)):
1632 self._mark_watched(*args, **kwargs)
1634 def _mark_watched(self, *args, **kwargs):
1635 raise NotImplementedError('This method must be implemented by subclasses')
1638 class SearchInfoExtractor(InfoExtractor):
1640 Base class for paged search queries extractors.
1641 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1642 Instances should define _SEARCH_KEY and _MAX_RESULTS.
1646 def _make_valid_url(cls):
1647 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1650 def suitable(cls, url):
1651 return re.match(cls._make_valid_url(), url) is not None
1653 def _real_extract(self, query):
1654 mobj = re.match(self._make_valid_url(), query)
1656 raise ExtractorError('Invalid search query "%s"' % query)
1658 prefix = mobj.group('prefix')
1659 query = mobj.group('query')
1661 return self._get_n_results(query, 1)
1662 elif prefix == 'all':
1663 return self._get_n_results(query, self._MAX_RESULTS)
1667 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1668 elif n > self._MAX_RESULTS:
1669 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1670 n = self._MAX_RESULTS
1671 return self._get_n_results(query, n)
1673 def _get_n_results(self, query, n):
1674 """Get a specified number of results for a query"""
1675 raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        # Public read-only accessor for the extractor's search key prefix.
        # NOTE(review): appears to be a @property (decorator line not visible
        # in this view) — confirm before calling it as a plain method.
        return self._SEARCH_KEY