1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
22 compat_urllib_parse_urlparse,
23 compat_urllib_request,
47 class InfoExtractor(object):
48 """Information Extractor class.
50 Information extractors are the classes that, given a URL, extract
51 information about the video (or videos) the URL refers to. This
52 information includes the real video URL, the video title, author and
53 others. The information is stored in a dictionary which is then
54 passed to the YoutubeDL. The YoutubeDL processes this
55 information possibly downloading the video to the file system, among
56 other possible outcomes.
58 The type field determines the type of the result.
59 By far the most common value (and the default if _type is missing) is
60 "video", which indicates a single video.
62 For a video, the dictionaries must include the following fields:
65 title: Video title, unescaped.
67 Additionally, it must contain either a formats entry or a url one:
69 formats: A list of dictionaries for each format available, ordered
70 from worst to best quality.
73 * url Mandatory. The URL of the video file
74 * ext Will be calculated from URL if missing
75 * format A human-readable description of the format
76 ("mp4 container with h264/opus").
77 Calculated from the format_id, width, height.
78 and format_note fields if missing.
79 * format_id A short description of the format
80 ("mp4_h264_opus" or "19").
81 Technically optional, but strongly recommended.
82 * format_note Additional info about the format
83 ("3D" or "DASH video")
84 * width Width of the video, if known
85 * height Height of the video, if known
86 * resolution Textual description of width and height
87 * tbr Average bitrate of audio and video in KBit/s
88 * abr Average audio bitrate in KBit/s
89 * acodec Name of the audio codec in use
90 * asr Audio sampling rate in Hertz
91 * vbr Average video bitrate in KBit/s
93 * vcodec Name of the video codec in use
94 * container Name of the container format
95 * filesize The number of bytes, if known in advance
96 * filesize_approx An estimate for the number of bytes
97 * player_url SWF Player URL (used for rtmpdump).
98 * protocol The protocol that will be used for the actual
100 "http", "https", "rtsp", "rtmp", "rtmpe",
101 "m3u8", or "m3u8_native".
102 * preference Order number of this format. If this field is
103 present and not None, the formats get sorted
104 by this field, regardless of all other values.
105 -1 for default (order by other properties),
106 -2 or smaller for less than default.
107 < -1000 to hide the format (if there is
108 another one which is strictly better)
109 * language_preference Is this in the correct requested
111 10 if it's what the URL is about,
112 -1 for default (don't know),
113 -10 otherwise, other values reserved for now.
114 * quality Order number of the video quality of this
115 format, irrespective of the file format.
116 -1 for default (order by other properties),
117 -2 or smaller for less than default.
118 * source_preference Order number for this video source
119 (quality takes higher priority)
120 -1 for default (order by other properties),
121 -2 or smaller for less than default.
122 * http_headers A dictionary of additional HTTP headers
123 to add to the request.
124 * stretched_ratio If given and not 1, indicates that the
125 video's pixels are not square.
126 width : height ratio as float.
127 * no_resume The server does not support resuming the
128 (HTTP or RTMP) download. Boolean.
130 url: Final video URL.
131 ext: Video filename extension.
132 format: The video format, defaults to ext (used for --get-format)
133 player_url: SWF Player URL (used for rtmpdump).
135 The following fields are optional:
137 alt_title: A secondary title of the video.
138 display_id An alternative identifier for the video, not necessarily
139 unique, but available before title. Typically, id is
140 something like "4234987", title "Dancing naked mole rats",
141 and display_id "dancing-naked-mole-rats"
142 thumbnails: A list of dictionaries, with the following entries:
143 * "id" (optional, string) - Thumbnail format ID
145 * "preference" (optional, int) - quality of the image
146 * "width" (optional, int)
147 * "height" (optional, int)
148                    * "resolution" (optional, string "{width}x{height}",
150 thumbnail: Full URL to a video thumbnail image.
151 description: Full video description.
152 uploader: Full name of the video uploader.
153 creator: The main artist who created the video.
154 timestamp: UNIX timestamp of the moment the video became available.
155 upload_date: Video upload date (YYYYMMDD).
156 If not explicitly set, calculated from timestamp.
157 uploader_id: Nickname or id of the video uploader.
158 location: Physical location where the video was filmed.
159 subtitles: The available subtitles as a dictionary in the format
160 {language: subformats}. "subformats" is a list sorted from
161 lower to higher preference, each element is a dictionary
162 with the "ext" entry and one of:
163 * "data": The subtitles file contents
164 * "url": A URL pointing to the subtitles file
165 automatic_captions: Like 'subtitles', used by the YoutubeIE for
166 automatically generated captions
167 duration: Length of the video in seconds, as an integer.
168 view_count: How many users have watched the video on the platform.
169 like_count: Number of positive ratings of the video
170 dislike_count: Number of negative ratings of the video
171 average_rating: Average rating given by users, the scale used depends on the webpage
172 comment_count: Number of comments on the video
173 comments: A list of comments, each with one or more of the following
174 properties (all but one of text or html optional):
175 * "author" - human-readable name of the comment author
176 * "author_id" - user ID of the comment author
178 * "html" - Comment as HTML
179 * "text" - Plain text of the comment
180 * "timestamp" - UNIX timestamp of comment
181 * "parent" - ID of the comment this one is replying to.
182 Set to "root" to indicate that this is a
183 comment to the original video.
184 age_limit: Age restriction for the video, as an integer (years)
185 webpage_url: The URL to the video webpage, if given to youtube-dl it
186 should allow to get the same result again. (It will be set
187 by YoutubeDL if it's missing)
188 categories: A list of categories that the video falls in, for example
190 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
191 is_live: True, False, or None (=unknown). Whether this video is a
192 live stream that goes on instead of a fixed-length video.
193 start_time: Time in seconds where the reproduction should start, as
194 specified in the URL.
195 end_time: Time in seconds where the reproduction should end, as
196 specified in the URL.
198 Unless mentioned otherwise, the fields should be Unicode strings.
200 Unless mentioned otherwise, None is equivalent to absence of information.
203 _type "playlist" indicates multiple videos.
204 There must be a key "entries", which is a list, an iterable, or a PagedList
205 object, each element of which is a valid dictionary by this specification.
207 Additionally, playlists can have "title", "description" and "id" attributes
208 with the same semantics as videos (see above).
211 _type "multi_video" indicates that there are multiple videos that
212 form a single show, for example, multiple acts of an opera or TV episode.
213 It must have an entries key like a playlist and contain all the keys
214 required for a video at the same time.
217 _type "url" indicates that the video must be extracted from another
218 location, possibly by a different extractor. Its only required key is:
219 "url" - the next URL to extract.
220 The key "ie_key" can be set to the class name (minus the trailing "IE",
221 e.g. "Youtube") if the extractor class is known in advance.
222 Additionally, the dictionary may have any properties of the resolved entity
223 known in advance, for example "title" if the title of the referred video is
227 _type "url_transparent" entities have the same specification as "url", but
228 indicate that the given additional information is more precise than the one
229 associated with the resolved URL.
230 This is useful when a site employs a video service that hosts the video and
231 its technical metadata, but that video service does not embed a useful
232 title, description etc.
235 Subclasses of this one should re-define the _real_initialize() and
236 _real_extract() methods and define a _VALID_URL regexp.
237 Probably, they should also be added to the list of extractors.
239 Finally, the _WORKING attribute should be set to False for broken IEs
240 in order to warn the users and skip the tests.
247 def __init__(self, downloader=None):
248 """Constructor. Receives an optional downloader."""
250 self.set_downloader(downloader)
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
256 # This does not use has/getattr intentionally - we want to know whether
257 # we have cached the regexp for *this* class, whereas getattr would also
258 # match the superclass
259 if '_VALID_URL_RE' not in cls.__dict__:
260 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
261 return cls._VALID_URL_RE.match(url) is not None
264 def _match_id(cls, url):
265 if '_VALID_URL_RE' not in cls.__dict__:
266 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
267 m = cls._VALID_URL_RE.match(url)
273 """Getter method for _WORKING."""
276 def initialize(self):
277 """Initializes an instance (authentication, etc)."""
279 self._real_initialize()
282 def extract(self, url):
283 """Extracts URL information and returns it in list of dicts."""
286 return self._real_extract(url)
287 except ExtractorError:
289 except compat_http_client.IncompleteRead as e:
290 raise ExtractorError('A network error has occured.', cause=e, expected=True)
291 except (KeyError, StopIteration) as e:
292 raise ExtractorError('An extractor error has occured.', cause=e)
294 def set_downloader(self, downloader):
295 """Sets the downloader for this IE."""
296 self._downloader = downloader
298 def _real_initialize(self):
299 """Real initialization process. Redefine in subclasses."""
302 def _real_extract(self, url):
303 """Real extraction process. Redefine in subclasses."""
308 """A string for getting the InfoExtractor with get_info_extractor"""
309 return cls.__name__[:-2]
313 return type(self).__name__[:-2]
315 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
316 """ Returns the response handle """
318 self.report_download_webpage(video_id)
319 elif note is not False:
321 self.to_screen('%s' % (note,))
323 self.to_screen('%s: %s' % (video_id, note))
325 return self._downloader.urlopen(url_or_request)
326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
330 errnote = 'Unable to download webpage'
331 errmsg = '%s: %s' % (errnote, compat_str(err))
333 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
335 self._downloader.report_warning(errmsg)
338 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
339 """ Returns a tuple (page content as string, URL handle) """
340 # Strip hashes from the URL (#1038)
341 if isinstance(url_or_request, (compat_str, str)):
342 url_or_request = url_or_request.partition('#')[0]
344 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
348 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
349 return (content, urlh)
352 def _guess_encoding_from_content(content_type, webpage_bytes):
353 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
355 encoding = m.group(1)
357 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
358 webpage_bytes[:1024])
360 encoding = m.group(1).decode('ascii')
361 elif webpage_bytes.startswith(b'\xff\xfe'):
368 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
369 content_type = urlh.headers.get('Content-Type', '')
370 webpage_bytes = urlh.read()
371 if prefix is not None:
372 webpage_bytes = prefix + webpage_bytes
374 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
375 if self._downloader.params.get('dump_intermediate_pages', False):
377 url = url_or_request.get_full_url()
378 except AttributeError:
380 self.to_screen('Dumping request to ' + url)
381 dump = base64.b64encode(webpage_bytes).decode('ascii')
382 self._downloader.to_screen(dump)
383 if self._downloader.params.get('write_pages', False):
385 url = url_or_request.get_full_url()
386 except AttributeError:
388 basen = '%s_%s' % (video_id, url)
390 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
391 basen = basen[:240 - len(h)] + h
392 raw_filename = basen + '.dump'
393 filename = sanitize_filename(raw_filename, restricted=True)
394 self.to_screen('Saving request to ' + filename)
395 # Working around MAX_PATH limitation on Windows (see
396 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
398 absfilepath = os.path.abspath(filename)
399 if len(absfilepath) > 259:
400 filename = '\\\\?\\' + absfilepath
401 with open(filename, 'wb') as outf:
402 outf.write(webpage_bytes)
405 content = webpage_bytes.decode(encoding, 'replace')
407 content = webpage_bytes.decode('utf-8', 'replace')
409 if ('<title>Access to this site is blocked</title>' in content and
410 'Websense' in content[:512]):
411 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
412 blocked_iframe = self._html_search_regex(
413 r'<iframe src="([^"]+)"', content,
414 'Websense information URL', default=None)
416 msg += ' Visit %s for more details' % blocked_iframe
417 raise ExtractorError(msg, expected=True)
418 if '<title>The URL you requested has been blocked</title>' in content[:512]:
420 'Access to this webpage has been blocked by Indian censorship. '
421 'Use a VPN or proxy server (with --proxy) to route around it.')
422 block_msg = self._html_search_regex(
423 r'</h1><p>(.*?)</p>',
424 content, 'block message', default=None)
426 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
427 raise ExtractorError(msg, expected=True)
431 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
432 """ Returns the data of the page as a string """
435 while success is False:
437 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
439 except compat_http_client.IncompleteRead as e:
441 if try_count >= tries:
443 self._sleep(timeout, video_id)
450 def _download_xml(self, url_or_request, video_id,
451 note='Downloading XML', errnote='Unable to download XML',
452 transform_source=None, fatal=True, encoding=None):
453 """Return the xml as an xml.etree.ElementTree.Element"""
454 xml_string = self._download_webpage(
455 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
456 if xml_string is False:
459 xml_string = transform_source(xml_string)
460 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
462 def _download_json(self, url_or_request, video_id,
463 note='Downloading JSON metadata',
464 errnote='Unable to download JSON metadata',
465 transform_source=None,
466 fatal=True, encoding=None):
467 json_string = self._download_webpage(
468 url_or_request, video_id, note, errnote, fatal=fatal,
470 if (not fatal) and json_string is False:
472 return self._parse_json(
473 json_string, video_id, transform_source=transform_source, fatal=fatal)
475 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
477 json_string = transform_source(json_string)
479 return json.loads(json_string)
480 except ValueError as ve:
481 errmsg = '%s: Failed to parse JSON ' % video_id
483 raise ExtractorError(errmsg, cause=ve)
485 self.report_warning(errmsg + str(ve))
487 def report_warning(self, msg, video_id=None):
488 idstr = '' if video_id is None else '%s: ' % video_id
489 self._downloader.report_warning(
490 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
492 def to_screen(self, msg):
493 """Print msg to screen, prefixing it with '[ie_name]'"""
494 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
496 def report_extraction(self, id_or_name):
497 """Report information extraction."""
498 self.to_screen('%s: Extracting information' % id_or_name)
500 def report_download_webpage(self, video_id):
501 """Report webpage download."""
502 self.to_screen('%s: Downloading webpage' % video_id)
504 def report_age_confirmation(self):
505 """Report attempt to confirm age."""
506 self.to_screen('Confirming age')
508 def report_login(self):
509 """Report attempt to log in."""
510 self.to_screen('Logging in')
512 # Methods for following #608
514 def url_result(url, ie=None, video_id=None, video_title=None):
515 """Returns a URL that points to a page that should be processed"""
516 # TODO: ie should be the class used for getting the info
517 video_info = {'_type': 'url',
520 if video_id is not None:
521 video_info['id'] = video_id
522 if video_title is not None:
523 video_info['title'] = video_title
527 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
528 """Returns a playlist"""
529 video_info = {'_type': 'playlist',
532 video_info['id'] = playlist_id
534 video_info['title'] = playlist_title
535 if playlist_description:
536 video_info['description'] = playlist_description
539 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
541 Perform a regex search on the given string, using a single or a list of
542 patterns returning the first matching group.
543 In case of failure return a default value or raise a WARNING or a
544 RegexNotFoundError, depending on fatal, specifying the field name.
546 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
547 mobj = re.search(pattern, string, flags)
550 mobj = re.search(p, string, flags)
554 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
555 _name = '\033[0;34m%s\033[0m' % name
561 # return the first matching group
562 return next(g for g in mobj.groups() if g is not None)
564 return mobj.group(group)
565 elif default is not NO_DEFAULT:
568 raise RegexNotFoundError('Unable to extract %s' % _name)
570 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
573 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
575 Like _search_regex, but strips HTML tags and unescapes entities.
577 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
579 return clean_html(res).strip()
583 def _get_login_info(self):
585 Get the login info as (username, password)
586 It will look in the netrc file using the _NETRC_MACHINE value
587 If there's no info available, return (None, None)
589 if self._downloader is None:
594 downloader_params = self._downloader.params
596 # Attempt to use provided username and password or .netrc data
597 if downloader_params.get('username', None) is not None:
598 username = downloader_params['username']
599 password = downloader_params['password']
600 elif downloader_params.get('usenetrc', False):
602 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
607 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
608 except (IOError, netrc.NetrcParseError) as err:
609 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
611 return (username, password)
613 def _get_tfa_info(self):
615 Get the two-factor authentication info
616 TODO - asking the user will be required for sms/phone verify
617 currently just uses the command line option
618 If there's no info available, return None
620 if self._downloader is None:
622 downloader_params = self._downloader.params
624 if downloader_params.get('twofactor', None) is not None:
625 return downloader_params['twofactor']
629 # Helper functions for extracting OpenGraph info
631 def _og_regexes(prop):
632 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
633 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
634 template = r'<meta[^>]+?%s[^>]+?%s'
636 template % (property_re, content_re),
637 template % (content_re, property_re),
641 def _meta_regex(prop):
642 return r'''(?isx)<meta
643 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
644 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
646 def _og_search_property(self, prop, html, name=None, **kargs):
648 name = 'OpenGraph %s' % prop
649 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
652 return unescapeHTML(escaped)
654 def _og_search_thumbnail(self, html, **kargs):
655 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
657 def _og_search_description(self, html, **kargs):
658 return self._og_search_property('description', html, fatal=False, **kargs)
660 def _og_search_title(self, html, **kargs):
661 return self._og_search_property('title', html, **kargs)
663 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
664 regexes = self._og_regexes('video') + self._og_regexes('video:url')
666 regexes = self._og_regexes('video:secure_url') + regexes
667 return self._html_search_regex(regexes, html, name, **kargs)
669 def _og_search_url(self, html, **kargs):
670 return self._og_search_property('url', html, **kargs)
672 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
673 if display_name is None:
675 return self._html_search_regex(
676 self._meta_regex(name),
677 html, display_name, fatal=fatal, group='content', **kwargs)
679 def _dc_search_uploader(self, html):
680 return self._html_search_meta('dc.creator', html, 'uploader')
682 def _rta_search(self, html):
683 # See http://www.rtalabel.org/index.php?content=howtofaq#single
684 if re.search(r'(?ix)<meta\s+name="rating"\s+'
685 r' content="RTA-5042-1996-1400-1577-RTA"',
690 def _media_rating_search(self, html):
691 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
692 rating = self._html_search_meta('rating', html)
704 return RATING_TABLE.get(rating.lower(), None)
706 def _family_friendly_search(self, html):
707 # See http://schema.org/VideoObject
708 family_friendly = self._html_search_meta('isFamilyFriendly', html)
710 if not family_friendly:
719 return RATING_TABLE.get(family_friendly.lower(), None)
721 def _twitter_search_player(self, html):
722 return self._html_search_meta('twitter:player', html,
723 'twitter card player')
726 def _hidden_inputs(html):
728 for input in re.findall(r'<input([^>]+)>', html):
729 if not re.search(r'type=(["\'])hidden\1', input):
731 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
734 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
737 hidden_inputs[name.group('value')] = value.group('value')
740 def _form_hidden_inputs(self, form_id, html):
741 form = self._search_regex(
742 r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
743 html, '%s form' % form_id, group='form')
744 return self._hidden_inputs(form)
746 def _sort_formats(self, formats, field_preference=None):
748 raise ExtractorError('No video formats found')
751 # TODO remove the following workaround
752 from ..utils import determine_ext
753 if not f.get('ext') and 'url' in f:
754 f['ext'] = determine_ext(f['url'])
756 if isinstance(field_preference, (list, tuple)):
757 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
759 preference = f.get('preference')
760 if preference is None:
761 proto = f.get('protocol')
763 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
765 preference = 0 if proto in ['http', 'https'] else -0.1
766 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
769 if f.get('vcodec') == 'none': # audio only
770 if self._downloader.params.get('prefer_free_formats'):
771 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
773 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
776 audio_ext_preference = ORDER.index(f['ext'])
778 audio_ext_preference = -1
780 if self._downloader.params.get('prefer_free_formats'):
781 ORDER = ['flv', 'mp4', 'webm']
783 ORDER = ['webm', 'flv', 'mp4']
785 ext_preference = ORDER.index(f['ext'])
788 audio_ext_preference = 0
792 f.get('language_preference') if f.get('language_preference') is not None else -1,
793 f.get('quality') if f.get('quality') is not None else -1,
794 f.get('tbr') if f.get('tbr') is not None else -1,
795 f.get('filesize') if f.get('filesize') is not None else -1,
796 f.get('vbr') if f.get('vbr') is not None else -1,
797 f.get('height') if f.get('height') is not None else -1,
798 f.get('width') if f.get('width') is not None else -1,
800 f.get('abr') if f.get('abr') is not None else -1,
801 audio_ext_preference,
802 f.get('fps') if f.get('fps') is not None else -1,
803 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
804 f.get('source_preference') if f.get('source_preference') is not None else -1,
805 f.get('format_id') if f.get('format_id') is not None else '',
807 formats.sort(key=_formats_key)
809 def _check_formats(self, formats, video_id):
812 lambda f: self._is_valid_url(
814 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
817 def _is_valid_url(self, url, video_id, item='video'):
818 url = self._proto_relative_url(url, scheme='http:')
819 # For now assume non HTTP(S) URLs always valid
820 if not (url.startswith('http://') or url.startswith('https://')):
823 self._request_webpage(url, video_id, 'Checking %s URL' % item)
825 except ExtractorError as e:
826 if isinstance(e.cause, compat_HTTPError):
828 '%s: %s URL is invalid, skipping' % (video_id, item))
832 def http_scheme(self):
833 """ Either "http:" or "https:", depending on the user's preferences """
836 if self._downloader.params.get('prefer_insecure', False)
839 def _proto_relative_url(self, url, scheme=None):
842 if url.startswith('//'):
844 scheme = self.http_scheme()
849 def _sleep(self, timeout, video_id, msg_template=None):
850 if msg_template is None:
851 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
852 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
856 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
857 transform_source=lambda s: fix_xml_ampersands(s).strip()):
858 manifest = self._download_xml(
859 manifest_url, video_id, 'Downloading f4m manifest',
860 'Unable to download f4m manifest',
861 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
862 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
863 transform_source=transform_source)
866 manifest_version = '1.0'
867 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
869 manifest_version = '2.0'
870 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
871 for i, media_el in enumerate(media_nodes):
872 if manifest_version == '2.0':
873 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
877 media_url if media_url.startswith('http://') or media_url.startswith('https://')
878 else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
879 # If media_url is itself a f4m manifest do the recursive extraction
880 # since bitrates in parent manifest (this one) and media_url manifest
881 # may differ leading to inability to resolve the format by requested
882 # bitrate in f4m downloader
883 if determine_ext(manifest_url) == 'f4m':
884 formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
886 tbr = int_or_none(media_el.attrib.get('bitrate'))
888 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
892 'width': int_or_none(media_el.attrib.get('width')),
893 'height': int_or_none(media_el.attrib.get('height')),
894 'preference': preference,
896 self._sort_formats(formats)
900 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
901 entry_protocol='m3u8', preference=None,
902 m3u8_id=None, note=None, errnote=None,
906 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
910 'preference': preference - 1 if preference else -1,
911 'resolution': 'multiple',
912 'format_note': 'Quality selection URL',
915 format_url = lambda u: (
917 if re.match(r'^https?://', u)
918 else compat_urlparse.urljoin(m3u8_url, u))
920 m3u8_doc = self._download_webpage(
922 note=note or 'Downloading m3u8 information',
923 errnote=errnote or 'Failed to download m3u8 information',
925 if m3u8_doc is False:
930 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
931 for line in m3u8_doc.splitlines():
932 if line.startswith('#EXT-X-STREAM-INF:'):
934 for m in kv_rex.finditer(line):
936 if v.startswith('"'):
938 last_info[m.group('key')] = v
939 elif line.startswith('#EXT-X-MEDIA:'):
941 for m in kv_rex.finditer(line):
943 if v.startswith('"'):
945 last_media[m.group('key')] = v
946 elif line.startswith('#') or not line.strip():
949 if last_info is None:
950 formats.append({'url': format_url(line)})
952 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
955 format_id.append(m3u8_id)
956 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
957 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
959 'format_id': '-'.join(format_id),
960 'url': format_url(line.strip()),
963 'protocol': entry_protocol,
964 'preference': preference,
966 codecs = last_info.get('CODECS')
968 # TODO: looks like video codec is not always necessarily goes first
969 va_codecs = codecs.split(',')
971 f['vcodec'] = va_codecs[0].partition('.')[0]
972 if len(va_codecs) > 1 and va_codecs[1]:
973 f['acodec'] = va_codecs[1].partition('.')[0]
974 resolution = last_info.get('RESOLUTION')
976 width_str, height_str = resolution.split('x')
977 f['width'] = int(width_str)
978 f['height'] = int(height_str)
979 if last_media is not None:
980 f['m3u8_media'] = last_media
984 self._sort_formats(formats)
988 def _xpath_ns(path, namespace=None):
992 for c in path.split('/'):
993 if not c or c == '.':
996 out.append('{%s}%s' % (namespace, c))
999 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1000 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1006 namespace = self._parse_smil_namespace(smil)
1008 return self._parse_smil_formats(
1009 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1011 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1012 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1015 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1017 def _download_smil(self, smil_url, video_id, fatal=True):
1018 return self._download_xml(
1019 smil_url, video_id, 'Downloading SMIL file',
1020 'Unable to download SMIL file', fatal=fatal)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        # Parse a SMIL document into an info dict (formats, subtitles, metadata).
        # NOTE(review): several interior lines of this method appear elided in
        # this view (title/description initialization, the loop's `continue`,
        # title assignment, and the opening of the returned dict literal) —
        # do not trust the control flow below as complete.
        namespace = self._parse_smil_namespace(smil)
        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
        # Fall back to the manifest URL's basename (sans extension) as the id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        # Scan <head><meta name=... content=...> entries for title/description.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            'title': title or video_id,
            'description': description,
            'subtitles': subtitles,
1050 def _parse_smil_namespace(self, smil):
1051 return self._search_regex(
1052 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
        # Build the formats list from a SMIL document: RTMP, HLS (m3u8),
        # HDS (f4m) and plain-HTTP variants, one per <video> element.
        # NOTE(review): this view is missing interior lines (e.g. `base` and
        # counter initialization, the rtmp/http format-dict openings, the f4m
        # parameter dict, and the final return) — treat the flow below as a
        # partial transcript.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            # A meta 'base'/'httpBase' attribute supplies the base URL used to
            # resolve relative <video src> values below.
            b = meta.get('base') or meta.get('httpBase')
        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            # system-bitrate is scaled by 1000 — presumably bits/s → KBit/s,
            # matching the `tbr` convention documented on InfoExtractor.
            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base
            # RTMP source: either declared via proto or an rtmp:// streamer URL.
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'filesize': filesize,
            # Resolve relative sources against the discovered base URL.
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
            if src_ext == 'f4m':
                'plugin': 'flowplayer-3.2.0.1',
                # Append extra f4m query parameters, preserving any existing '?'.
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
            if src_url.startswith('http'):
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'filesize': filesize,
        self._sort_formats(formats)
    def _parse_smil_subtitles(self, smil, namespace=None):
        # Collect subtitle tracks from <textstream> elements, keyed by language.
        # NOTE(review): lines appear elided in this view (subtitles dict init,
        # a guard/`continue` after the src read, the srt ext override, the
        # appended dict body and the final return) — partial transcript.
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            # Prefer the explicit ext attribute; otherwise derive it from src.
            ext = textstream.get('ext') or determine_ext(src)
            type_ = textstream.get('type')
            if type_ == 'text/srt':
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
            subtitles.setdefault(lang, []).append({
1149 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1150 xspf = self._download_xml(
1151 playlist_url, playlist_id, 'Downloading xpsf playlist',
1152 'Unable to download xspf manifest', fatal=fatal)
1155 return self._parse_xspf(xspf, playlist_id)
    def _parse_xspf(self, playlist, playlist_id):
        # Turn an XSPF playlist XML document into a list of entry info dicts.
        # NOTE(review): the NS_MAP assignment line, the entries-list setup, the
        # title assignment head, the formats-list opening brace and the entry
        # dict opening are elided in this view — partial transcript.
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is scaled by 1000 — presumably milliseconds → seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
            'url': location.text,
            # StreamOne's s1: attributes carry per-location format metadata.
            'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
            'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
            'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
1192 def _live_title(self, name):
1193 """ Generate the title for a live video """
1194 now = datetime.datetime.now()
1195 now_str = now.strftime("%Y-%m-%d %H:%M")
1196 return name + ' ' + now_str
1198 def _int(self, v, name, fatal=False, **kwargs):
1199 res = int_or_none(v, **kwargs)
1200 if 'get_attr' in kwargs:
1201 print(getattr(v, kwargs['get_attr']))
1203 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1205 raise ExtractorError(msg)
1207 self._downloader.report_warning(msg)
1210 def _float(self, v, name, fatal=False, **kwargs):
1211 res = float_or_none(v, **kwargs)
1213 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1215 raise ExtractorError(msg)
1217 self._downloader.report_warning(msg)
1220 def _set_cookie(self, domain, name, value, expire_time=None):
1221 cookie = compat_cookiejar.Cookie(
1222 0, name, value, None, None, domain, None,
1223 None, '/', True, False, expire_time, '', None, None, None)
1224 self._downloader.cookiejar.set_cookie(cookie)
1226 def _get_cookies(self, url):
1227 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1228 req = compat_urllib_request.Request(url)
1229 self._downloader.cookiejar.add_cookie_header(req)
1230 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        # Produce this extractor's test cases from _TEST/_TESTS, tagging each
        # with the extractor's name. NOTE(review): lines appear elided in this
        # view (the `if t:` guard, tests collection, the per-test loop, its
        # `continue` and the final yield) — partial transcript.
        t = getattr(self, '_TEST', None)
        # An extractor must define either _TEST or _TESTS, never both.
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = getattr(self, '_TESTS', [])
        # Skip URL-pattern-only ("only_matching") cases unless requested.
        if not include_onlymatching and t.get('only_matching', False):
        # The test name is the class name minus the trailing "IE" suffix.
        t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # NOTE(review): the body of the `if not is_restricted:` guard below is
        # elided in this view — the visible control flow is incomplete.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # Playlist test cases carry the age limit on their first entry.
            if 'playlist' in tc:
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
1261 def extract_subtitles(self, *args, **kwargs):
1262 if (self._downloader.params.get('writesubtitles', False) or
1263 self._downloader.params.get('listsubtitles')):
1264 return self._get_subtitles(*args, **kwargs)
1267 def _get_subtitles(self, *args, **kwargs):
1268 raise NotImplementedError("This method must be implemented by subclasses")
1270 def extract_automatic_captions(self, *args, **kwargs):
1271 if (self._downloader.params.get('writeautomaticsub', False) or
1272 self._downloader.params.get('listsubtitles')):
1273 return self._get_automatic_captions(*args, **kwargs)
1276 def _get_automatic_captions(self, *args, **kwargs):
1277 raise NotImplementedError("This method must be implemented by subclasses")
1280 class SearchInfoExtractor(InfoExtractor):
1282 Base class for paged search queries extractors.
1283 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1284 Instances should define _SEARCH_KEY and _MAX_RESULTS.
    def _make_valid_url(cls):
        # Build the URL pattern for search queries: an optional count prefix
        # (empty, a positive integer, or 'all'), then ':' and the query text.
        # NOTE(review): takes `cls` and is invoked via cls/self elsewhere —
        # presumably decorated @classmethod (decorator not visible here);
        # confirm before relying on call style.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    def suitable(cls, url):
        # True iff url matches this extractor's search-URL pattern.
        # NOTE(review): takes `cls` — presumably a @classmethod (decorator not
        # visible in this view); confirm.
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse the search "URL" and dispatch to _get_n_results with the
        # requested result count. NOTE(review): several guard lines are elided
        # in this view (the `if mobj is None:` check, the empty-prefix branch
        # header, the `n = int(prefix)` assignment and its `n <= 0` check) —
        # the visible control flow is incomplete.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # An empty prefix means: fetch a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp to the extractor's maximum and warn instead of failing.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
1315 def _get_n_results(self, query, n):
1316 """Get a specified number of results for a query"""
1317 raise NotImplementedError("This method must be implemented by subclasses")
    def SEARCH_KEY(self):
        # Public accessor for the extractor's search key string.
        # NOTE(review): presumably decorated @property (decorator not visible
        # in this view) — confirm before calling it as an attribute.
        return self._SEARCH_KEY