1 from __future__ import unicode_literals
15 from ..compat import (
24 compat_etree_fromstring,
53 class InfoExtractor(object):
54 """Information Extractor class.
56 Information extractors are the classes that, given a URL, extract
57 information about the video (or videos) the URL refers to. This
58 information includes the real video URL, the video title, author and
59 others. The information is stored in a dictionary which is then
60 passed to the YoutubeDL. The YoutubeDL processes this
61 information, possibly downloading the video to the file system, among
62 other possible outcomes.
64 The type field determines the type of the result.
65 By far the most common value (and the default if _type is missing) is
66 "video", which indicates a single video.
68 For a video, the dictionaries must include the following fields:
71 title: Video title, unescaped.
73 Additionally, it must contain either a formats entry or a url one:
75 formats: A list of dictionaries for each format available, ordered
76 from worst to best quality.
79 * url Mandatory. The URL of the video file
80 * ext Will be calculated from URL if missing
81 * format A human-readable description of the format
82 ("mp4 container with h264/opus").
83 Calculated from the format_id, width, height,
84 and format_note fields if missing.
85 * format_id A short description of the format
86 ("mp4_h264_opus" or "19").
87 Technically optional, but strongly recommended.
88 * format_note Additional info about the format
89 ("3D" or "DASH video")
90 * width Width of the video, if known
91 * height Height of the video, if known
92 * resolution Textual description of width and height
93 * tbr Average bitrate of audio and video in KBit/s
94 * abr Average audio bitrate in KBit/s
95 * acodec Name of the audio codec in use
96 * asr Audio sampling rate in Hertz
97 * vbr Average video bitrate in KBit/s
99 * vcodec Name of the video codec in use
100 * container Name of the container format
101 * filesize The number of bytes, if known in advance
102 * filesize_approx An estimate for the number of bytes
103 * player_url SWF Player URL (used for rtmpdump).
104 * protocol The protocol that will be used for the actual
105 download, lower-case.
106 "http", "https", "rtsp", "rtmp", "rtmpe",
107 "m3u8", or "m3u8_native".
108 * preference Order number of this format. If this field is
109 present and not None, the formats get sorted
110 by this field, regardless of all other values.
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 < -1000 to hide the format (if there is
114 another one which is strictly better)
115 * language Language code, e.g. "de" or "en-US".
116 * language_preference Is this in the language mentioned in the URL?
118 10 if it's what the URL is about,
119 -1 for default (don't know),
120 -10 otherwise, other values reserved for now.
121 * quality Order number of the video quality of this
122 format, irrespective of the file format.
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
125 * source_preference Order number for this video source
126 (quality takes higher priority)
127 -1 for default (order by other properties),
128 -2 or smaller for less than default.
129 * http_headers A dictionary of additional HTTP headers
130 to add to the request.
131 * stretched_ratio If given and not 1, indicates that the
132 video's pixels are not square.
133 width : height ratio as float.
134 * no_resume The server does not support resuming the
135 (HTTP or RTMP) download. Boolean.
137 url: Final video URL.
138 ext: Video filename extension.
139 format: The video format, defaults to ext (used for --get-format)
140 player_url: SWF Player URL (used for rtmpdump).
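
For illustration, a hypothetical minimal result for a single video with one
format could look like this (all values made up):

    {
        'id': '4234987',
        'title': 'Dancing naked mole rats',
        'formats': [{
            'url': 'https://example.com/videos/4234987.mp4',
            'ext': 'mp4',
            'format_id': 'mp4-720p',
            'width': 1280,
            'height': 720,
        }],
    }
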
142 The following fields are optional:
144 alt_title: A secondary title of the video.
145 display_id: An alternative identifier for the video, not necessarily
146 unique, but available before title. Typically, id is
147 something like "4234987", title "Dancing naked mole rats",
148 and display_id "dancing-naked-mole-rats"
149 thumbnails: A list of dictionaries, with the following entries:
150 * "id" (optional, string) - Thumbnail format ID
152 * "preference" (optional, int) - quality of the image
153 * "width" (optional, int)
154 * "height" (optional, int)
155 * "resolution" (optional, string "{width}x{height"},
157 thumbnail: Full URL to a video thumbnail image.
158 description: Full video description.
159 uploader: Full name of the video uploader.
160 license: License name the video is licensed under.
161 creator: The main artist who created the video.
162 release_date: The date (YYYYMMDD) when the video was released.
163 timestamp: UNIX timestamp of the moment the video became available.
164 upload_date: Video upload date (YYYYMMDD).
165 If not explicitly set, calculated from timestamp.
166 uploader_id: Nickname or id of the video uploader.
167 uploader_url: Full URL to a personal webpage of the video uploader.
168 location: Physical location where the video was filmed.
169 subtitles: The available subtitles as a dictionary in the format
170 {language: subformats}. "subformats" is a list sorted from
171 lower to higher preference, each element is a dictionary
172 with the "ext" entry and one of:
173 * "data": The subtitles file contents
174 * "url": A URL pointing to the subtitles file
175 "ext" will be calculated from URL if missing
176 automatic_captions: Like 'subtitles', used by the YoutubeIE for
177 automatically generated captions
178 duration: Length of the video in seconds, as an integer or float.
179 view_count: How many users have watched the video on the platform.
180 like_count: Number of positive ratings of the video
181 dislike_count: Number of negative ratings of the video
182 repost_count: Number of reposts of the video
183 average_rating: Average rating given by users, the scale used depends on the webpage
184 comment_count: Number of comments on the video
185 comments: A list of comments, each with one or more of the following
186 properties (all optional, but at least one of "text" or "html" is required):
187 * "author" - human-readable name of the comment author
188 * "author_id" - user ID of the comment author
190 * "html" - Comment as HTML
191 * "text" - Plain text of the comment
192 * "timestamp" - UNIX timestamp of comment
193 * "parent" - ID of the comment this one is replying to.
194 Set to "root" to indicate that this is a
195 comment to the original video.
196 age_limit: Age restriction for the video, as an integer (years)
197 webpage_url: The URL to the video webpage, if given to youtube-dl it
198 should allow getting the same result again. (It will be set
199 by YoutubeDL if it's missing)
200 categories: A list of categories that the video falls in, for example ["Sports", "Berlin"]
202 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
203 is_live: True, False, or None (=unknown). Whether this video is a
204 live stream that goes on instead of a fixed-length video.
205 start_time: Time in seconds where the reproduction should start, as
206 specified in the URL.
207 end_time: Time in seconds where the reproduction should end, as
208 specified in the URL.
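
To illustrate a few of the optional fields above, a hypothetical result could
additionally carry (values made up):

    {
        # id, title and formats as above, plus:
        'thumbnails': [{
            'url': 'https://example.com/thumbs/4234987_hd.jpg',
            'width': 1920,
            'height': 1080,
        }],
        'subtitles': {
            'en': [{
                'ext': 'vtt',
                'url': 'https://example.com/subs/4234987.en.vtt',
            }],
        },
        'duration': 1320.5,
    }
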
210 The following fields should only be used when the video belongs to some logical chapter or section:
213 chapter: Name or title of the chapter the video belongs to.
214 chapter_number: Number of the chapter the video belongs to, as an integer.
215 chapter_id: Id of the chapter the video belongs to, as a unicode string.
217 The following fields should only be used when the video is an episode of some series or programme:
220 series: Title of the series or programme the video episode belongs to.
221 season: Title of the season the video episode belongs to.
222 season_number: Number of the season the video episode belongs to, as an integer.
223 season_id: Id of the season the video episode belongs to, as a unicode string.
224 episode: Title of the video episode. Unlike mandatory video title field,
225 this field should denote the exact title of the video episode
226 without any kind of decoration.
227 episode_number: Number of the video episode within a season, as an integer.
228 episode_id: Id of the video episode, as a unicode string.
230 Unless mentioned otherwise, the fields should be Unicode strings.
232 Unless mentioned otherwise, None is equivalent to absence of information.
235 _type "playlist" indicates multiple videos.
236 There must be a key "entries", which is a list, an iterable, or a PagedList
237 object, each element of which is a valid dictionary by this specification.
239 Additionally, playlists can have "title", "description" and "id" attributes
240 with the same semantics as videos (see above).
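
A hypothetical playlist result could look like (values made up):

    {
        '_type': 'playlist',
        'id': 'PL4234987',
        'title': 'Naked mole rat documentaries',
        'entries': [
            {'_type': 'url', 'url': 'https://example.com/watch/1', 'ie_key': 'Example'},
            {'_type': 'url', 'url': 'https://example.com/watch/2', 'ie_key': 'Example'},
        ],
    }
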
243 _type "multi_video" indicates that there are multiple videos that
244 form a single show, for example multiple acts of an opera or TV episode.
245 It must have an entries key like a playlist and contain all the keys
246 required for a video at the same time.
249 _type "url" indicates that the video must be extracted from another
250 location, possibly by a different extractor. Its only required key is:
251 "url" - the next URL to extract.
252 The key "ie_key" can be set to the class name (minus the trailing "IE",
253 e.g. "Youtube") if the extractor class is known in advance.
254 Additionally, the dictionary may have any properties of the resolved entity
255 known in advance, for example "title" if the title of the referred video is known ahead of time.
259 _type "url_transparent" entities have the same specification as "url", but
260 indicate that the given additional information is more precise than the one
261 associated with the resolved URL.
262 This is useful when a site employs a video service that hosts the video and
263 its technical metadata, but that video service does not embed a useful
264 title, description etc.
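
For example (hypothetical values), an extractor can hand off to another
extractor:

    {
        '_type': 'url',
        'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
        'ie_key': 'Youtube',
    }

or, with "url_transparent", additionally override metadata of the resolved
entity:

    {
        '_type': 'url_transparent',
        'url': 'https://videoservice.example.com/embed/1234',
        'title': 'Title taken from the embedding page',
    }
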
267 Subclasses of this one should re-define the _real_initialize() and
268 _real_extract() methods and define a _VALID_URL regexp.
269 Probably, they should also be added to the list of extractors.
271 Finally, the _WORKING attribute should be set to False for broken IEs
272 in order to warn the users and skip the tests.
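
As a rough sketch (not a real extractor; the name, URL pattern and page
layout are made up), a typical subclass looks like:

    class ExampleIE(InfoExtractor):
        _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            return {
                'id': video_id,
                'title': self._og_search_title(webpage),
                'url': self._og_search_video_url(webpage),
            }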
279 def __init__(self, downloader=None):
280 """Constructor. Receives an optional downloader."""
282 self.set_downloader(downloader)
285 def suitable(cls, url):
286 """Receives a URL and returns True if suitable for this IE."""
288 # This does not use has/getattr intentionally - we want to know whether
289 # we have cached the regexp for *this* class, whereas getattr would also
290 # match the superclass
291 if '_VALID_URL_RE' not in cls.__dict__:
292 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
293 return cls._VALID_URL_RE.match(url) is not None
296 def _match_id(cls, url):
297 if '_VALID_URL_RE' not in cls.__dict__:
298 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
299 m = cls._VALID_URL_RE.match(url)
305 """Getter method for _WORKING."""
308 def initialize(self):
309 """Initializes an instance (authentication, etc)."""
311 self._real_initialize()
314 def extract(self, url):
315 """Extracts URL information and returns it in list of dicts."""
318 return self._real_extract(url)
319 except ExtractorError:
321 except compat_http_client.IncompleteRead as e:
322 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
323 except (KeyError, StopIteration) as e:
324 raise ExtractorError('An extractor error has occurred.', cause=e)
326 def set_downloader(self, downloader):
327 """Sets the downloader for this IE."""
328 self._downloader = downloader
330 def _real_initialize(self):
331 """Real initialization process. Redefine in subclasses."""
334 def _real_extract(self, url):
335 """Real extraction process. Redefine in subclasses."""
340 """A string for getting the InfoExtractor with get_info_extractor"""
341 return compat_str(cls.__name__[:-2])
345 return compat_str(type(self).__name__[:-2])
347 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
348 """ Returns the response handle """
350 self.report_download_webpage(video_id)
351 elif note is not False:
353 self.to_screen('%s' % (note,))
355 self.to_screen('%s: %s' % (video_id, note))
357 return self._downloader.urlopen(url_or_request)
358 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
362 errnote = 'Unable to download webpage'
364 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
366 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
368 self._downloader.report_warning(errmsg)
371 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
372 """ Returns a tuple (page content as string, URL handle) """
373 # Strip hashes from the URL (#1038)
374 if isinstance(url_or_request, (compat_str, str)):
375 url_or_request = url_or_request.partition('#')[0]
377 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
381 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
382 return (content, urlh)
385 def _guess_encoding_from_content(content_type, webpage_bytes):
386 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
388 encoding = m.group(1)
390 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
391 webpage_bytes[:1024])
393 encoding = m.group(1).decode('ascii')
394 elif webpage_bytes.startswith(b'\xff\xfe'):
401 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
402 content_type = urlh.headers.get('Content-Type', '')
403 webpage_bytes = urlh.read()
404 if prefix is not None:
405 webpage_bytes = prefix + webpage_bytes
407 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
408 if self._downloader.params.get('dump_intermediate_pages', False):
410 url = url_or_request.get_full_url()
411 except AttributeError:
413 self.to_screen('Dumping request to ' + url)
414 dump = base64.b64encode(webpage_bytes).decode('ascii')
415 self._downloader.to_screen(dump)
416 if self._downloader.params.get('write_pages', False):
418 url = url_or_request.get_full_url()
419 except AttributeError:
421 basen = '%s_%s' % (video_id, url)
423 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
424 basen = basen[:240 - len(h)] + h
425 raw_filename = basen + '.dump'
426 filename = sanitize_filename(raw_filename, restricted=True)
427 self.to_screen('Saving request to ' + filename)
428 # Working around MAX_PATH limitation on Windows (see
429 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
431 absfilepath = os.path.abspath(filename)
432 if len(absfilepath) > 259:
433 filename = '\\\\?\\' + absfilepath
434 with open(filename, 'wb') as outf:
435 outf.write(webpage_bytes)
438 content = webpage_bytes.decode(encoding, 'replace')
440 content = webpage_bytes.decode('utf-8', 'replace')
442 if ('<title>Access to this site is blocked</title>' in content and
443 'Websense' in content[:512]):
444 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
445 blocked_iframe = self._html_search_regex(
446 r'<iframe src="([^"]+)"', content,
447 'Websense information URL', default=None)
449 msg += ' Visit %s for more details' % blocked_iframe
450 raise ExtractorError(msg, expected=True)
451 if '<title>The URL you requested has been blocked</title>' in content[:512]:
453 'Access to this webpage has been blocked by Indian censorship. '
454 'Use a VPN or proxy server (with --proxy) to route around it.')
455 block_msg = self._html_search_regex(
456 r'</h1><p>(.*?)</p>',
457 content, 'block message', default=None)
459 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
460 raise ExtractorError(msg, expected=True)
464 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
465 """ Returns the data of the page as a string """
468 while success is False:
470 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
472 except compat_http_client.IncompleteRead as e:
474 if try_count >= tries:
476 self._sleep(timeout, video_id)
483 def _download_xml(self, url_or_request, video_id,
484 note='Downloading XML', errnote='Unable to download XML',
485 transform_source=None, fatal=True, encoding=None):
486 """Return the xml as an xml.etree.ElementTree.Element"""
487 xml_string = self._download_webpage(
488 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
489 if xml_string is False:
492 xml_string = transform_source(xml_string)
493 return compat_etree_fromstring(xml_string.encode('utf-8'))
495 def _download_json(self, url_or_request, video_id,
496 note='Downloading JSON metadata',
497 errnote='Unable to download JSON metadata',
498 transform_source=None,
499 fatal=True, encoding=None):
500 json_string = self._download_webpage(
501 url_or_request, video_id, note, errnote, fatal=fatal,
503 if (not fatal) and json_string is False:
505 return self._parse_json(
506 json_string, video_id, transform_source=transform_source, fatal=fatal)
508 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
510 json_string = transform_source(json_string)
512 return json.loads(json_string)
513 except ValueError as ve:
514 errmsg = '%s: Failed to parse JSON ' % video_id
516 raise ExtractorError(errmsg, cause=ve)
518 self.report_warning(errmsg + str(ve))
520 def update_url_params(self, url, params):
521 parsed_url = compat_urlparse.urlparse(url)
522 qs = compat_urlparse.parse_qs(parsed_url.query)
524 return compat_urlparse.urlunparse(
525 parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
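# A hypothetical usage sketch (URL and parameters are made up; note that the
# existing query string may be re-encoded in a different order):
#   self.update_url_params('http://example.com/api?page=1', {'count': '20'})
#   # -> 'http://example.com/api?page=1&count=20'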
527 def report_warning(self, msg, video_id=None):
528 idstr = '' if video_id is None else '%s: ' % video_id
529 self._downloader.report_warning(
530 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
532 def to_screen(self, msg):
533 """Print msg to screen, prefixing it with '[ie_name]'"""
534 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
536 def report_extraction(self, id_or_name):
537 """Report information extraction."""
538 self.to_screen('%s: Extracting information' % id_or_name)
540 def report_download_webpage(self, video_id):
541 """Report webpage download."""
542 self.to_screen('%s: Downloading webpage' % video_id)
544 def report_age_confirmation(self):
545 """Report attempt to confirm age."""
546 self.to_screen('Confirming age')
548 def report_login(self):
549 """Report attempt to log in."""
550 self.to_screen('Logging in')
553 def raise_login_required(msg='This video is only available for registered users'):
554 raise ExtractorError(
555 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
559 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
560 raise ExtractorError(
561 '%s. You might want to use --proxy to workaround.' % msg,
564 # Methods for following #608
566 def url_result(url, ie=None, video_id=None, video_title=None):
567 """Returns a URL that points to a page that should be processed"""
568 # TODO: ie should be the class used for getting the info
569 video_info = {'_type': 'url',
572 if video_id is not None:
573 video_info['id'] = video_id
574 if video_title is not None:
575 video_info['title'] = video_title
579 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
580 """Returns a playlist"""
581 video_info = {'_type': 'playlist',
584 video_info['id'] = playlist_id
586 video_info['title'] = playlist_title
587 if playlist_description:
588 video_info['description'] = playlist_description
591 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
593 Perform a regex search on the given string, using a single pattern or a
594 list of patterns, and return the first matching group.
595 In case of failure, return a default value, emit a warning or raise a
596 RegexNotFoundError (depending on default and fatal), using the field name in the message.
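
A hypothetical call (pattern and page content are made up):

    duration = self._search_regex(
        r'<span id="duration">(\d+)</span>', webpage, 'duration',
        default=None)

returns '42' when the page contains '<span id="duration">42</span>' and
None (without a warning) when nothing matches.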
598 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
599 mobj = re.search(pattern, string, flags)
602 mobj = re.search(p, string, flags)
606 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
607 _name = '\033[0;34m%s\033[0m' % name
613 # return the first matching group
614 return next(g for g in mobj.groups() if g is not None)
616 return mobj.group(group)
617 elif default is not NO_DEFAULT:
620 raise RegexNotFoundError('Unable to extract %s' % _name)
622 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
625 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
627 Like _search_regex, but strips HTML tags and unescapes entities.
629 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
631 return clean_html(res).strip()
635 def _get_login_info(self):
637 Get the login info as (username, password)
638 It will look in the netrc file using the _NETRC_MACHINE value
639 If there's no info available, return (None, None)
641 if self._downloader is None:
646 downloader_params = self._downloader.params
648 # Attempt to use provided username and password or .netrc data
649 if downloader_params.get('username') is not None:
650 username = downloader_params['username']
651 password = downloader_params['password']
652 elif downloader_params.get('usenetrc', False):
654 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
659 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
660 except (IOError, netrc.NetrcParseError) as err:
661 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
663 return (username, password)
665 def _get_tfa_info(self, note='two-factor verification code'):
667 Get the two-factor authentication info
668 TODO - asking the user will be required for sms/phone verify
669 currently just uses the command line option
670 If there's no info available, return None
672 if self._downloader is None:
674 downloader_params = self._downloader.params
676 if downloader_params.get('twofactor') is not None:
677 return downloader_params['twofactor']
679 return compat_getpass('Type %s and press [Return]: ' % note)
681 # Helper functions for extracting OpenGraph info
683 def _og_regexes(prop):
684 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
685 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
686 % {'prop': re.escape(prop)})
687 template = r'<meta[^>]+?%s[^>]+?%s'
689 template % (property_re, content_re),
690 template % (content_re, property_re),
694 def _meta_regex(prop):
695 return r'''(?isx)<meta
696 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
697 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
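# For illustration (hypothetical markup): _og_regexes('title') produces patterns
# matching tags such as
#   <meta property="og:title" content="Some title">
# in either attribute order, while _meta_regex('description') matches tags such as
#   <meta name="description" content="Some description">
# exposing the content attribute as the named group "content".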
699 def _og_search_property(self, prop, html, name=None, **kargs):
701 name = 'OpenGraph %s' % prop
702 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
705 return unescapeHTML(escaped)
707 def _og_search_thumbnail(self, html, **kargs):
708 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
710 def _og_search_description(self, html, **kargs):
711 return self._og_search_property('description', html, fatal=False, **kargs)
713 def _og_search_title(self, html, **kargs):
714 return self._og_search_property('title', html, **kargs)
716 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
717 regexes = self._og_regexes('video') + self._og_regexes('video:url')
719 regexes = self._og_regexes('video:secure_url') + regexes
720 return self._html_search_regex(regexes, html, name, **kargs)
722 def _og_search_url(self, html, **kargs):
723 return self._og_search_property('url', html, **kargs)
725 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
726 if display_name is None:
728 return self._html_search_regex(
729 self._meta_regex(name),
730 html, display_name, fatal=fatal, group='content', **kwargs)
732 def _dc_search_uploader(self, html):
733 return self._html_search_meta('dc.creator', html, 'uploader')
735 def _rta_search(self, html):
736 # See http://www.rtalabel.org/index.php?content=howtofaq#single
737 if re.search(r'(?ix)<meta\s+name="rating"\s+'
738 r' content="RTA-5042-1996-1400-1577-RTA"',
743 def _media_rating_search(self, html):
744 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
745 rating = self._html_search_meta('rating', html)
757 return RATING_TABLE.get(rating.lower())
759 def _family_friendly_search(self, html):
760 # See http://schema.org/VideoObject
761 family_friendly = self._html_search_meta('isFamilyFriendly', html)
763 if not family_friendly:
772 return RATING_TABLE.get(family_friendly.lower())
774 def _twitter_search_player(self, html):
775 return self._html_search_meta('twitter:player', html,
776 'twitter card player')
778 def _search_json_ld(self, html, video_id, **kwargs):
779 json_ld = self._search_regex(
780 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
781 html, 'JSON-LD', group='json_ld', **kwargs)
784 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
786 def _json_ld(self, json_ld, video_id, fatal=True):
787 if isinstance(json_ld, compat_str):
788 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
792 if json_ld.get('@context') == 'http://schema.org':
793 item_type = json_ld.get('@type')
794 if item_type == 'TVEpisode':
796 'episode': unescapeHTML(json_ld.get('name')),
797 'episode_number': int_or_none(json_ld.get('episodeNumber')),
798 'description': unescapeHTML(json_ld.get('description')),
800 part_of_season = json_ld.get('partOfSeason')
801 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
802 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
803 part_of_series = json_ld.get('partOfSeries')
804 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
805 info['series'] = unescapeHTML(part_of_series.get('name'))
806 elif item_type == 'Article':
808 'timestamp': parse_iso8601(json_ld.get('datePublished')),
809 'title': unescapeHTML(json_ld.get('headline')),
810 'description': unescapeHTML(json_ld.get('articleBody')),
812 return dict((k, v) for k, v in info.items() if v is not None)
815 def _hidden_inputs(html):
816 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
818 for input in re.findall(r'(?i)<input([^>]+)>', html):
819 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
821 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
824 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
827 hidden_inputs[name.group('value')] = value.group('value')
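# As a rough illustration (hypothetical markup), given
#   <input type="hidden" name="csrf_token" value="abc123">
# _hidden_inputs() returns {'csrf_token': 'abc123'}; non-hidden inputs and
# inputs without a name or value attribute are skipped.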
830 def _form_hidden_inputs(self, form_id, html):
831 form = self._search_regex(
832 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
833 html, '%s form' % form_id, group='form')
834 return self._hidden_inputs(form)
836 def _sort_formats(self, formats, field_preference=None):
838 raise ExtractorError('No video formats found')
841 # Automatically determine tbr when missing based on abr and vbr (improves
842 # formats sorting in some cases)
843 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
844 f['tbr'] = f['abr'] + f['vbr']
847 # TODO remove the following workaround
848 from ..utils import determine_ext
849 if not f.get('ext') and 'url' in f:
850 f['ext'] = determine_ext(f['url'])
852 if isinstance(field_preference, (list, tuple)):
853 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
855 preference = f.get('preference')
856 if preference is None:
858 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
861 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
863 if f.get('vcodec') == 'none': # audio only
864 if self._downloader.params.get('prefer_free_formats'):
865 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
867 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
870 audio_ext_preference = ORDER.index(f['ext'])
872 audio_ext_preference = -1
874 if self._downloader.params.get('prefer_free_formats'):
875 ORDER = ['flv', 'mp4', 'webm']
877 ORDER = ['webm', 'flv', 'mp4']
879 ext_preference = ORDER.index(f['ext'])
882 audio_ext_preference = 0
886 f.get('language_preference') if f.get('language_preference') is not None else -1,
887 f.get('quality') if f.get('quality') is not None else -1,
888 f.get('tbr') if f.get('tbr') is not None else -1,
889 f.get('filesize') if f.get('filesize') is not None else -1,
890 f.get('vbr') if f.get('vbr') is not None else -1,
891 f.get('height') if f.get('height') is not None else -1,
892 f.get('width') if f.get('width') is not None else -1,
895 f.get('abr') if f.get('abr') is not None else -1,
896 audio_ext_preference,
897 f.get('fps') if f.get('fps') is not None else -1,
898 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
899 f.get('source_preference') if f.get('source_preference') is not None else -1,
900 f.get('format_id') if f.get('format_id') is not None else '',
902 formats.sort(key=_formats_key)
904 def _check_formats(self, formats, video_id):
907 lambda f: self._is_valid_url(
909 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
913 def _remove_duplicate_formats(formats):
917 if f['url'] not in format_urls:
918 format_urls.add(f['url'])
919 unique_formats.append(f)
920 formats[:] = unique_formats
922 def _is_valid_url(self, url, video_id, item='video'):
923 url = self._proto_relative_url(url, scheme='http:')
924 # For now assume non HTTP(S) URLs always valid
925 if not (url.startswith('http://') or url.startswith('https://')):
928 self._request_webpage(url, video_id, 'Checking %s URL' % item)
930 except ExtractorError as e:
931 if isinstance(e.cause, compat_urllib_error.URLError):
933 '%s: %s URL is invalid, skipping' % (video_id, item))
937 def http_scheme(self):
938 """ Either "http:" or "https:", depending on the user's preferences """
941 if self._downloader.params.get('prefer_insecure', False)
944 def _proto_relative_url(self, url, scheme=None):
947 if url.startswith('//'):
949 scheme = self.http_scheme()
954 def _sleep(self, timeout, video_id, msg_template=None):
955 if msg_template is None:
956 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
957 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
961 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
962 transform_source=lambda s: fix_xml_ampersands(s).strip(),
964 manifest = self._download_xml(
965 manifest_url, video_id, 'Downloading f4m manifest',
966 'Unable to download f4m manifest',
967 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
968 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
969 transform_source=transform_source,
972 if manifest is False:
976 manifest_version = '1.0'
977 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
979 manifest_version = '2.0'
980 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
981 base_url = xpath_text(
982 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
983 'base URL', default=None)
985 base_url = base_url.strip()
986 for i, media_el in enumerate(media_nodes):
987 if manifest_version == '2.0':
988 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
992 media_url if media_url.startswith('http://') or media_url.startswith('https://')
993 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
994 # If media_url is itself a f4m manifest do the recursive extraction
995 # since bitrates in parent manifest (this one) and media_url manifest
996 # may differ leading to inability to resolve the format by requested
997 # bitrate in f4m downloader
998 if determine_ext(manifest_url) == 'f4m':
999 formats.extend(self._extract_f4m_formats(
1000 manifest_url, video_id, preference, f4m_id, fatal=fatal))
1002 tbr = int_or_none(media_el.attrib.get('bitrate'))
1004 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1005 'url': manifest_url,
1008 'width': int_or_none(media_el.attrib.get('width')),
1009 'height': int_or_none(media_el.attrib.get('height')),
1010 'preference': preference,
1012 self._sort_formats(formats)
1016 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1017 entry_protocol='m3u8', preference=None,
1018 m3u8_id=None, note=None, errnote=None,
1022 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1026 'preference': preference - 1 if preference else -1,
1027 'resolution': 'multiple',
1028 'format_note': 'Quality selection URL',
1031 format_url = lambda u: (
1033 if re.match(r'^https?://', u)
1034 else compat_urlparse.urljoin(m3u8_url, u))
1036 res = self._download_webpage_handle(
1038 note=note or 'Downloading m3u8 information',
1039 errnote=errnote or 'Failed to download m3u8 information',
1043 m3u8_doc, urlh = res
1044 m3u8_url = urlh.geturl()
1046 # We should try extracting formats only from master playlists [1], i.e.
1047 # playlists that describe available qualities. On the other hand media
1048 # playlists [2] should be returned as is since they contain just the media
1049 # without qualities renditions.
1050 # Fortunately, master playlist can be easily distinguished from media
1051 # playlist based on particular tags availability. As of [1, 2] master
1052 # playlist tags MUST NOT appear in a media playlist and vice versa.
1053 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1054 # and MUST NOT appear in master playlist thus we can clearly detect media
1055 # playlist with this criterion.
1056 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1057 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1058 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
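# For illustration (hypothetical manifest excerpt), a master playlist contains
# lines such as:
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=1280x720,CODECS="avc1.64001f,mp4a.40.2"
#   http://example.com/hi.m3u8
# whereas a media playlist lists the actual segments and carries
# #EXT-X-TARGETDURATION.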
1059 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1062 'format_id': m3u8_id,
1064 'protocol': entry_protocol,
1065 'preference': preference,
1069 kv_rex = re.compile(
1070 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1071 for line in m3u8_doc.splitlines():
1072 if line.startswith('#EXT-X-STREAM-INF:'):
1074 for m in kv_rex.finditer(line):
1076 if v.startswith('"'):
1078 last_info[m.group('key')] = v
1079 elif line.startswith('#EXT-X-MEDIA:'):
1081 for m in kv_rex.finditer(line):
1083 if v.startswith('"'):
1085 last_media[m.group('key')] = v
1086 elif line.startswith('#') or not line.strip():
1089 if last_info is None:
1090 formats.append({'url': format_url(line)})
1092 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1095 format_id.append(m3u8_id)
1096 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1097 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1099 'format_id': '-'.join(format_id),
1100 'url': format_url(line.strip()),
1103 'protocol': entry_protocol,
1104 'preference': preference,
1106 resolution = last_info.get('RESOLUTION')
1108 width_str, height_str = resolution.split('x')
1109 f['width'] = int(width_str)
1110 f['height'] = int(height_str)
1111 codecs = last_info.get('CODECS')
1113 vcodec, acodec = [None] * 2
1114 va_codecs = codecs.split(',')
1115 if len(va_codecs) == 1:
1116 # Audio only entries usually come with single codec and
1117 # no resolution. For more robustness we also check it to be mp4 audio.
1119 if not resolution and va_codecs[0].startswith('mp4a'):
1120 vcodec, acodec = 'none', va_codecs[0]
1122 vcodec = va_codecs[0]
1124 vcodec, acodec = va_codecs[:2]
1129 if last_media is not None:
1130 f['m3u8_media'] = last_media
1134 self._sort_formats(formats)
1138 def _xpath_ns(path, namespace=None):
1142 for c in path.split('/'):
1143 if not c or c == '.':
1146 out.append('{%s}%s' % (namespace, c))
1147 return '/'.join(out)
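# For example, with the SMIL 2.1 namespace,
# _xpath_ns('./head/meta', 'http://www.w3.org/2005/SMIL21/Language') returns
# './{http://www.w3.org/2005/SMIL21/Language}head/{http://www.w3.org/2005/SMIL21/Language}meta'.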
1149 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1150 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1156 namespace = self._parse_smil_namespace(smil)
1158 return self._parse_smil_formats(
1159 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1161 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1162 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1165 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1167 def _download_smil(self, smil_url, video_id, fatal=True):
1168 return self._download_xml(
1169 smil_url, video_id, 'Downloading SMIL file',
1170 'Unable to download SMIL file', fatal=fatal)
1172 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1173 namespace = self._parse_smil_namespace(smil)
1175 formats = self._parse_smil_formats(
1176 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1177 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1179 video_id = os.path.splitext(url_basename(smil_url))[0]
1183 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1184 name = meta.attrib.get('name')
1185 content = meta.attrib.get('content')
1186 if not name or not content:
1188 if not title and name == 'title':
1190 elif not description and name in ('description', 'abstract'):
1191 description = content
1192 elif not upload_date and name == 'date':
1193 upload_date = unified_strdate(content)
1196 'id': image.get('type'),
1197 'url': image.get('src'),
1198 'width': int_or_none(image.get('width')),
1199 'height': int_or_none(image.get('height')),
1200 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1204 'title': title or video_id,
1205 'description': description,
1206 'upload_date': upload_date,
1207 'thumbnails': thumbnails,
1209 'subtitles': subtitles,
1212 def _parse_smil_namespace(self, smil):
1213 return self._search_regex(
1214 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1216 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1218 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1219 b = meta.get('base') or meta.get('httpBase')
1230 videos = smil.findall(self._xpath_ns('.//video', namespace))
1231 for video in videos:
1232 src = video.get('src')
1233 if not src or src in srcs:
1237 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1238 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1239 width = int_or_none(video.get('width'))
1240 height = int_or_none(video.get('height'))
1241 proto = video.get('proto')
1242 ext = video.get('ext')
1243 src_ext = determine_ext(src)
1244 streamer = video.get('streamer') or base
1246 if proto == 'rtmp' or streamer.startswith('rtmp'):
1252 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1254 'filesize': filesize,
1258 if transform_rtmp_url:
1259 streamer, src = transform_rtmp_url(streamer, src)
1260 formats[-1].update({
1266 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1267 src_url = src_url.strip()
1269 if proto == 'm3u8' or src_ext == 'm3u8':
1270 m3u8_formats = self._extract_m3u8_formats(
1271 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1272 if len(m3u8_formats) == 1:
1274 m3u8_formats[0].update({
1275 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1280 formats.extend(m3u8_formats)
1283 if src_ext == 'f4m':
1288 'plugin': 'flowplayer-3.2.0.1',
1290 f4m_url += '&' if '?' in f4m_url else '?'
1291 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1292 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1295 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1299 'ext': ext or src_ext or 'flv',
1300 'format_id': 'http-%d' % (bitrate or http_count),
1302 'filesize': filesize,
1308 self._sort_formats(formats)
1312 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1315 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1316 src = textstream.get('src')
1317 if not src or src in urls:
1320 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1321 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1322 subtitles.setdefault(lang, []).append({
1328 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1329 xspf = self._download_xml(
1330 playlist_url, playlist_id, 'Downloading xspf playlist',
1331 'Unable to download xspf manifest', fatal=fatal)
1334 return self._parse_xspf(xspf, playlist_id)
1336 def _parse_xspf(self, playlist, playlist_id):
1338 'xspf': 'http://xspf.org/ns/0/',
1339 's1': 'http://static.streamone.nl/player/ns/0',
1343 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1345 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1346 description = xpath_text(
1347 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1348 thumbnail = xpath_text(
1349 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1350 duration = float_or_none(
1351 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1354 'url': location.text,
1355 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1356 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1357 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1358 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1359 self._sort_formats(formats)
1364 'description': description,
1365 'thumbnail': thumbnail,
1366 'duration': duration,
1371 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1372 res = self._download_webpage_handle(
1374 note=note or 'Downloading MPD manifest',
1375 errnote=errnote or 'Failed to download MPD manifest',
1380 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1382 return self._parse_mpd_formats(
1383 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1385 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1386 if mpd_doc.get('type') == 'dynamic':
1389 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1392 return self._xpath_ns(path, namespace)
1394 def is_drm_protected(element):
1395 return element.find(_add_ns('ContentProtection')) is not None
1397 def extract_multisegment_info(element, ms_parent_info):
1398 ms_info = ms_parent_info.copy()
1399 segment_list = element.find(_add_ns('SegmentList'))
1400 if segment_list is not None:
1401 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1403 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1404 initialization = segment_list.find(_add_ns('Initialization'))
1405 if initialization is not None:
1406 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1408 segment_template = element.find(_add_ns('SegmentTemplate'))
1409 if segment_template is not None:
1410 start_number = segment_template.get('startNumber')
1412 ms_info['start_number'] = int(start_number)
1413 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1414 if segment_timeline is not None:
1415 s_e = segment_timeline.findall(_add_ns('S'))
1417 ms_info['total_number'] = 0
1419 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1421 timescale = segment_template.get('timescale')
1423 ms_info['timescale'] = int(timescale)
1424 segment_duration = segment_template.get('duration')
1425 if segment_duration:
1426 ms_info['segment_duration'] = int(segment_duration)
1427 media_template = segment_template.get('media')
1429 ms_info['media_template'] = media_template
1430 initialization = segment_template.get('initialization')
1432 ms_info['initialization_url'] = initialization
1434 initialization = segment_template.find(_add_ns('Initialization'))
1435 if initialization is not None:
1436 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1439 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1441 for period in mpd_doc.findall(_add_ns('Period')):
1442 period_duration = parse_duration(period.get('duration')) or mpd_duration
1443 period_ms_info = extract_multisegment_info(period, {
1447 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1448 if is_drm_protected(adaptation_set):
1450 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1451 for representation in adaptation_set.findall(_add_ns('Representation')):
1452 if is_drm_protected(representation):
1454 representation_attrib = adaptation_set.attrib.copy()
1455 representation_attrib.update(representation.attrib)
1456 mime_type = representation_attrib.get('mimeType')
1457 content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1458 if content_type == 'text':
1459 # TODO implement WebVTT downloading
1461 elif content_type == 'video' or content_type == 'audio':
1463 for element in (representation, adaptation_set, period, mpd_doc):
1464 base_url_e = element.find(_add_ns('BaseURL'))
1465 if base_url_e is not None:
1466 base_url = base_url_e.text + base_url
1467 if re.match(r'^https?://', base_url):
1469 if mpd_base_url and not re.match(r'^https?://', base_url):
1470 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1472 base_url = mpd_base_url + base_url
1473 representation_id = representation_attrib.get('id')
1474 lang = representation_attrib.get('lang')
1475 url_el = representation.find(_add_ns('BaseURL'))
1476 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1478 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1480 'width': int_or_none(representation_attrib.get('width')),
1481 'height': int_or_none(representation_attrib.get('height')),
1482 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1483 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1484 'fps': int_or_none(representation_attrib.get('frameRate')),
1485 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1486 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1487 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1488 'format_note': 'DASH %s' % content_type,
1489 'filesize': filesize,
1491 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1492 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1493 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1494 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1495 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1496 media_template = representation_ms_info['media_template']
1497 media_template = media_template.replace('$RepresentationID$', representation_id)
1498 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1499 media_template = media_template.replace('$$', '$')
1500 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
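# As an illustration (hypothetical template): 'seg-$RepresentationID$-$Number%05d$.m4s'
# becomes 'seg-video1-%(Number)05d.m4s' after the substitutions above (for a
# representation id of 'video1') and is then expanded per segment number,
# e.g. 'seg-video1-00001.m4s' for segment number 1.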
1501 if 'segment_urls' in representation_ms_info:
1503 'segment_urls': representation_ms_info['segment_urls'],
1504 'protocol': 'http_dash_segments',
1506 if 'initialization_url' in representation_ms_info:
1507 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1509 'initialization_url': initialization_url,
1511 if not f.get('url'):
1512 f['url'] = initialization_url
1514 existing_format = next(
1515 fo for fo in formats
1516 if fo['format_id'] == representation_id)
1517 except StopIteration:
1518 full_info = formats_dict.get(representation_id, {}).copy()
1520 formats.append(full_info)
1522 existing_format.update(f)
1524 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1525 self._sort_formats(formats)
1528 def _live_title(self, name):
1529 """ Generate the title for a live video """
1530 now = datetime.datetime.now()
1531 now_str = now.strftime('%Y-%m-%d %H:%M')
1532 return name + ' ' + now_str
1534 def _int(self, v, name, fatal=False, **kwargs):
1535 res = int_or_none(v, **kwargs)
1536 if 'get_attr' in kwargs:
1537 print(getattr(v, kwargs['get_attr']))
1539 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1541 raise ExtractorError(msg)
1543 self._downloader.report_warning(msg)
1546 def _float(self, v, name, fatal=False, **kwargs):
1547 res = float_or_none(v, **kwargs)
1549 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1551 raise ExtractorError(msg)
1553 self._downloader.report_warning(msg)
1556 def _set_cookie(self, domain, name, value, expire_time=None):
1557 cookie = compat_cookiejar.Cookie(
1558 0, name, value, None, None, domain, None,
1559 None, '/', True, False, expire_time, '', None, None, None)
1560 self._downloader.cookiejar.set_cookie(cookie)
1562 def _get_cookies(self, url):
1563 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1564 req = sanitized_Request(url)
1565 self._downloader.cookiejar.add_cookie_header(req)
1566 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1568 def get_testcases(self, include_onlymatching=False):
1569 t = getattr(self, '_TEST', None)
1571 assert not hasattr(self, '_TESTS'), \
1572 '%s has _TEST and _TESTS' % type(self).__name__
1575 tests = getattr(self, '_TESTS', [])
1577 if not include_onlymatching and t.get('only_matching', False):
1579 t['name'] = type(self).__name__[:-len('IE')]
1582 def is_suitable(self, age_limit):
1583 """ Test whether the extractor is generally suitable for the given
1584 age limit (i.e. pornographic sites are not, all others usually are) """
1586 any_restricted = False
1587 for tc in self.get_testcases(include_onlymatching=False):
1588 if 'playlist' in tc:
1589 tc = tc['playlist'][0]
1590 is_restricted = age_restricted(
1591 tc.get('info_dict', {}).get('age_limit'), age_limit)
1592 if not is_restricted:
1594 any_restricted = any_restricted or is_restricted
1595 return not any_restricted
1597 def extract_subtitles(self, *args, **kwargs):
1598 if (self._downloader.params.get('writesubtitles', False) or
1599 self._downloader.params.get('listsubtitles')):
1600 return self._get_subtitles(*args, **kwargs)
1603 def _get_subtitles(self, *args, **kwargs):
1604 raise NotImplementedError('This method must be implemented by subclasses')
1607 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1608 """ Merge subtitle items for one language. Items with duplicated URLs
1609 will be dropped. """
1610 list1_urls = set([item['url'] for item in subtitle_list1])
1611 ret = list(subtitle_list1)
1612 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1616 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1617 """ Merge two subtitle dictionaries, language by language. """
1618 ret = dict(subtitle_dict1)
1619 for lang in subtitle_dict2:
1620 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1623 def extract_automatic_captions(self, *args, **kwargs):
1624 if (self._downloader.params.get('writeautomaticsub', False) or
1625 self._downloader.params.get('listsubtitles')):
1626 return self._get_automatic_captions(*args, **kwargs)
1629 def _get_automatic_captions(self, *args, **kwargs):
1630 raise NotImplementedError('This method must be implemented by subclasses')
1632 def mark_watched(self, *args, **kwargs):
1633 if (self._downloader.params.get('mark_watched', False) and
1634 (self._get_login_info()[0] is not None or
1635 self._downloader.params.get('cookiefile') is not None)):
1636 self._mark_watched(*args, **kwargs)
1638 def _mark_watched(self, *args, **kwargs):
1639 raise NotImplementedError('This method must be implemented by subclasses')
1642 class SearchInfoExtractor(InfoExtractor):
1644 Base class for paged search query extractors.
1645 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1646 Instances should define _SEARCH_KEY and _MAX_RESULTS.
1650 def _make_valid_url(cls):
1651 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
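# For a hypothetical _SEARCH_KEY of 'examplesearch', this matches queries such as
#   examplesearch:cute cats       (first result only)
#   examplesearch5:cute cats      (first 5 results)
#   examplesearchall:cute cats    (up to _MAX_RESULTS results)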
1654 def suitable(cls, url):
1655 return re.match(cls._make_valid_url(), url) is not None
1657 def _real_extract(self, query):
1658 mobj = re.match(self._make_valid_url(), query)
1660 raise ExtractorError('Invalid search query "%s"' % query)
1662 prefix = mobj.group('prefix')
1663 query = mobj.group('query')
1665 return self._get_n_results(query, 1)
1666 elif prefix == 'all':
1667 return self._get_n_results(query, self._MAX_RESULTS)
1671 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1672 elif n > self._MAX_RESULTS:
1673 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1674 n = self._MAX_RESULTS
1675 return self._get_n_results(query, n)
1677 def _get_n_results(self, query, n):
1678 """Get a specified number of results for a query"""
1679 raise NotImplementedError('This method must be implemented by subclasses')
1682 def SEARCH_KEY(self):
1683 return self._SEARCH_KEY