10 import xml.etree.ElementTree
15 compat_urllib_parse_urlparse,
# Unique sentinel for "no default supplied" in _search_regex and friends,
# so that callers may legitimately pass None as an explicit default.
_NO_DEFAULT = object()
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information about the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title, author and
34 others. The information is stored in a dictionary which is then
35 passed to the FileDownloader. The FileDownloader processes this
36 information possibly downloading the video to the file system, among
37 other possible outcomes.
39 The dictionaries must include the following fields:
42 title: Video title, unescaped.
44 Additionally, it must contain either a formats entry or a url one:
46 formats: A list of dictionaries for each format available, ordered
47 from worst to best quality.
50 * url Mandatory. The URL of the video file
51 * ext Will be calculated from url if missing
52 * format A human-readable description of the format
53 ("mp4 container with h264/opus").
54 Calculated from the format_id, width, height.
55 and format_note fields if missing.
56 * format_id A short description of the format
57 ("mp4_h264_opus" or "19").
58 Technically optional, but strongly recommended.
59 * format_note Additional info about the format
60 ("3D" or "DASH video")
61 * width Width of the video, if known
62 * height Height of the video, if known
63 * resolution Textual description of width and height
64 * tbr Average bitrate of audio and video in KBit/s
65 * abr Average audio bitrate in KBit/s
66 * acodec Name of the audio codec in use
67 * asr Audio sampling rate in Hertz
68 * vbr Average video bitrate in KBit/s
69 * vcodec Name of the video codec in use
70 * container Name of the container format
71 * filesize The number of bytes, if known in advance
72 * filesize_approx An estimate for the number of bytes
73 * player_url SWF Player URL (used for rtmpdump).
74 * protocol The protocol that will be used for the actual
76 "http", "https", "rtsp", "rtmp", "m3u8" or so.
77 * preference Order number of this format. If this field is
78 present and not None, the formats get sorted
79 by this field, regardless of all other values.
80 -1 for default (order by other properties),
81 -2 or smaller for less than default.
82 * quality Order number of the video quality of this
83 format, irrespective of the file format.
84 -1 for default (order by other properties),
85 -2 or smaller for less than default.
87 ext: Video filename extension.
88 format: The video format, defaults to ext (used for --get-format)
89 player_url: SWF Player URL (used for rtmpdump).
91 The following fields are optional:
93 display_id An alternative identifier for the video, not necessarily
94 unique, but available before title. Typically, id is
95 something like "4234987", title "Dancing naked mole rats",
96 and display_id "dancing-naked-mole-rats"
97 thumbnails: A list of dictionaries, with the following entries:
99 * "width" (optional, int)
100 * "height" (optional, int)
101 * "resolution" (optional, string "{width}x{height"},
103 thumbnail: Full URL to a video thumbnail image.
104 description: One-line video description.
105 uploader: Full name of the video uploader.
106 timestamp: UNIX timestamp of the moment the video became available.
107 upload_date: Video upload date (YYYYMMDD).
108 If not explicitly set, calculated from timestamp.
109 uploader_id: Nickname or id of the video uploader.
110 location: Physical location of the video.
111 subtitles: The subtitle file contents as a dictionary in the format
112 {language: subtitles}.
113 duration: Length of the video in seconds, as an integer.
114 view_count: How many users have watched the video on the platform.
115 like_count: Number of positive ratings of the video
116 dislike_count: Number of negative ratings of the video
117 comment_count: Number of comments on the video
118 age_limit: Age restriction for the video, as an integer (years)
119 webpage_url: The url to the video webpage, if given to youtube-dl it
120 should allow to get the same result again. (It will be set
121 by YoutubeDL if it's missing)
122 categories: A list of categories that the video falls in, for example
125 Unless mentioned otherwise, the fields should be Unicode strings.
127 Subclasses of this one should re-define the _real_initialize() and
128 _real_extract() methods and define a _VALID_URL regexp.
129 Probably, they should also be added to the list of extractors.
131 Finally, the _WORKING attribute should be set to False for broken IEs
132 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): upstream also initializes self._ready = False here;
        # that line appears to be elided from this copy -- confirm against
        # VCS history before relying on initialize()'s caching behavior.
        self.set_downloader(downloader)
145 def suitable(cls, url):
146 """Receives a URL and returns True if suitable for this IE."""
148 # This does not use has/getattr intentionally - we want to know whether
149 # we have cached the regexp for *this* class, whereas getattr would also
150 # match the superclass
151 if '_VALID_URL_RE' not in cls.__dict__:
152 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
153 return cls._VALID_URL_RE.match(url) is not None
157 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): upstream guards this with `if not self._ready:` and
        # sets self._ready = True afterwards; those lines look elided here.
        # As written, _real_initialize() runs on every call -- confirm.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): upstream calls self.initialize() before delegating;
        # that line appears elided in this copy -- confirm before use.
        return self._real_extract(url)
171 def set_downloader(self, downloader):
172 """Sets the downloader for this IE."""
173 self._downloader = downloader
175 def _real_initialize(self):
176 """Real initialization process. Redefine in subclasses."""
179 def _real_extract(self, url):
180 """Real extraction process. Redefine in subclasses."""
185 """A string for getting the InfoExtractor with get_info_extractor"""
186 return cls.__name__[:-2]
190 return type(self).__name__[:-2]
192 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
193 """ Returns the response handle """
195 self.report_download_webpage(video_id)
196 elif note is not False:
198 self.to_screen(u'%s' % (note,))
200 self.to_screen(u'%s: %s' % (video_id, note))
202 return self._downloader.urlopen(url_or_request)
203 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
207 errnote = u'Unable to download webpage'
208 errmsg = u'%s: %s' % (errnote, compat_str(err))
210 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
212 self._downloader.report_warning(errmsg)
215 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
216 """ Returns a tuple (page content as string, URL handle) """
218 # Strip hashes from the URL (#1038)
219 if isinstance(url_or_request, (compat_str, str)):
220 url_or_request = url_or_request.partition('#')[0]
222 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
226 content_type = urlh.headers.get('Content-Type', '')
227 webpage_bytes = urlh.read()
228 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
230 encoding = m.group(1)
232 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
233 webpage_bytes[:1024])
235 encoding = m.group(1).decode('ascii')
236 elif webpage_bytes.startswith(b'\xff\xfe'):
240 if self._downloader.params.get('dump_intermediate_pages', False):
242 url = url_or_request.get_full_url()
243 except AttributeError:
245 self.to_screen(u'Dumping request to ' + url)
246 dump = base64.b64encode(webpage_bytes).decode('ascii')
247 self._downloader.to_screen(dump)
248 if self._downloader.params.get('write_pages', False):
250 url = url_or_request.get_full_url()
251 except AttributeError:
253 basen = '%s_%s' % (video_id, url)
255 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
256 basen = basen[:240 - len(h)] + h
257 raw_filename = basen + '.dump'
258 filename = sanitize_filename(raw_filename, restricted=True)
259 self.to_screen(u'Saving request to ' + filename)
260 with open(filename, 'wb') as outf:
261 outf.write(webpage_bytes)
264 content = webpage_bytes.decode(encoding, 'replace')
266 content = webpage_bytes.decode('utf-8', 'replace')
268 if (u'<title>Access to this site is blocked</title>' in content and
269 u'Websense' in content[:512]):
270 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
271 blocked_iframe = self._html_search_regex(
272 r'<iframe src="([^"]+)"', content,
273 u'Websense information URL', default=None)
275 msg += u' Visit %s for more details' % blocked_iframe
276 raise ExtractorError(msg, expected=True)
278 return (content, urlh)
280 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
281 """ Returns the data of the page as a string """
282 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
289 def _download_xml(self, url_or_request, video_id,
290 note=u'Downloading XML', errnote=u'Unable to download XML',
291 transform_source=None, fatal=True):
292 """Return the xml as an xml.etree.ElementTree.Element"""
293 xml_string = self._download_webpage(
294 url_or_request, video_id, note, errnote, fatal=fatal)
295 if xml_string is False:
298 xml_string = transform_source(xml_string)
299 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
301 def _download_json(self, url_or_request, video_id,
302 note=u'Downloading JSON metadata',
303 errnote=u'Unable to download JSON metadata',
304 transform_source=None,
306 json_string = self._download_webpage(
307 url_or_request, video_id, note, errnote, fatal=fatal)
308 if (not fatal) and json_string is False:
311 json_string = transform_source(json_string)
313 return json.loads(json_string)
314 except ValueError as ve:
315 raise ExtractorError('Failed to download JSON', cause=ve)
317 def report_warning(self, msg, video_id=None):
318 idstr = u'' if video_id is None else u'%s: ' % video_id
319 self._downloader.report_warning(
320 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
322 def to_screen(self, msg):
323 """Print msg to screen, prefixing it with '[ie_name]'"""
324 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
326 def report_extraction(self, id_or_name):
327 """Report information extraction."""
328 self.to_screen(u'%s: Extracting information' % id_or_name)
330 def report_download_webpage(self, video_id):
331 """Report webpage download."""
332 self.to_screen(u'%s: Downloading webpage' % video_id)
334 def report_age_confirmation(self):
335 """Report attempt to confirm age."""
336 self.to_screen(u'Confirming age')
338 def report_login(self):
339 """Report attempt to log in."""
340 self.to_screen(u'Logging in')
    # Methods for following #608
344 def url_result(url, ie=None, video_id=None):
345 """Returns a url that points to a page that should be processed"""
346 #TODO: ie should be the class used for getting the info
347 video_info = {'_type': 'url',
350 if video_id is not None:
351 video_info['id'] = video_id
354 def playlist_result(entries, playlist_id=None, playlist_title=None):
355 """Returns a playlist"""
356 video_info = {'_type': 'playlist',
359 video_info['id'] = playlist_id
361 video_info['title'] = playlist_title
364 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
366 Perform a regex search on the given string, using a single or a list of
367 patterns returning the first matching group.
368 In case of failure return a default value or raise a WARNING or a
369 RegexNotFoundError, depending on fatal, specifying the field name.
371 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
372 mobj = re.search(pattern, string, flags)
375 mobj = re.search(p, string, flags)
378 if os.name != 'nt' and sys.stderr.isatty():
379 _name = u'\033[0;34m%s\033[0m' % name
384 # return the first matching group
385 return next(g for g in mobj.groups() if g is not None)
386 elif default is not _NO_DEFAULT:
389 raise RegexNotFoundError(u'Unable to extract %s' % _name)
391 self._downloader.report_warning(u'unable to extract %s; '
392 u'please report this issue on http://yt-dl.org/bug' % _name)
395 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
397 Like _search_regex, but strips HTML tags and unescapes entities.
399 res = self._search_regex(pattern, string, name, default, fatal, flags)
401 return clean_html(res).strip()
405 def _get_login_info(self):
407 Get the the login info as (username, password)
408 It will look in the netrc file using the _NETRC_MACHINE value
409 If there's no info available, return (None, None)
411 if self._downloader is None:
416 downloader_params = self._downloader.params
418 # Attempt to use provided username and password or .netrc data
419 if downloader_params.get('username', None) is not None:
420 username = downloader_params['username']
421 password = downloader_params['password']
422 elif downloader_params.get('usenetrc', False):
424 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
429 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
430 except (IOError, netrc.NetrcParseError) as err:
431 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
433 return (username, password)
435 # Helper functions for extracting OpenGraph info
437 def _og_regexes(prop):
438 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
439 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
440 template = r'<meta[^>]+?%s[^>]+?%s'
442 template % (property_re, content_re),
443 template % (content_re, property_re),
446 def _og_search_property(self, prop, html, name=None, **kargs):
448 name = 'OpenGraph %s' % prop
449 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
452 return unescapeHTML(escaped)
454 def _og_search_thumbnail(self, html, **kargs):
455 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
457 def _og_search_description(self, html, **kargs):
458 return self._og_search_property('description', html, fatal=False, **kargs)
460 def _og_search_title(self, html, **kargs):
461 return self._og_search_property('title', html, **kargs)
463 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
464 regexes = self._og_regexes('video')
465 if secure: regexes = self._og_regexes('video:secure_url') + regexes
466 return self._html_search_regex(regexes, html, name, **kargs)
468 def _og_search_url(self, html, **kargs):
469 return self._og_search_property('url', html, **kargs)
471 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
472 if display_name is None:
474 return self._html_search_regex(
476 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
477 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
478 html, display_name, fatal=fatal, **kwargs)
480 def _dc_search_uploader(self, html):
481 return self._html_search_meta('dc.creator', html, 'uploader')
483 def _rta_search(self, html):
484 # See http://www.rtalabel.org/index.php?content=howtofaq#single
485 if re.search(r'(?ix)<meta\s+name="rating"\s+'
486 r' content="RTA-5042-1996-1400-1577-RTA"',
491 def _media_rating_search(self, html):
492 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
493 rating = self._html_search_meta('rating', html)
505 return RATING_TABLE.get(rating.lower(), None)
507 def _twitter_search_player(self, html):
508 return self._html_search_meta('twitter:player', html,
509 'twitter card player')
511 def _sort_formats(self, formats):
513 raise ExtractorError(u'No video formats found')
516 # TODO remove the following workaround
517 from ..utils import determine_ext
518 if not f.get('ext') and 'url' in f:
519 f['ext'] = determine_ext(f['url'])
521 preference = f.get('preference')
522 if preference is None:
523 proto = f.get('protocol')
525 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
527 preference = 0 if proto in ['http', 'https'] else -0.1
528 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
531 if f.get('vcodec') == 'none': # audio only
532 if self._downloader.params.get('prefer_free_formats'):
533 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
535 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
538 audio_ext_preference = ORDER.index(f['ext'])
540 audio_ext_preference = -1
542 if self._downloader.params.get('prefer_free_formats'):
543 ORDER = [u'flv', u'mp4', u'webm']
545 ORDER = [u'webm', u'flv', u'mp4']
547 ext_preference = ORDER.index(f['ext'])
550 audio_ext_preference = 0
554 f.get('quality') if f.get('quality') is not None else -1,
555 f.get('height') if f.get('height') is not None else -1,
556 f.get('width') if f.get('width') is not None else -1,
558 f.get('tbr') if f.get('tbr') is not None else -1,
559 f.get('vbr') if f.get('vbr') is not None else -1,
560 f.get('abr') if f.get('abr') is not None else -1,
561 audio_ext_preference,
562 f.get('filesize') if f.get('filesize') is not None else -1,
563 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
566 formats.sort(key=_formats_key)
568 def http_scheme(self):
569 """ Either "https:" or "https:", depending on the user's preferences """
572 if self._downloader.params.get('prefer_insecure', False)
575 def _proto_relative_url(self, url, scheme=None):
578 if url.startswith('//'):
580 scheme = self.http_scheme()
585 def _sleep(self, timeout, video_id, msg_template=None):
586 if msg_template is None:
587 msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
588 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
    # NOTE(review): the docstring's triple-quote delimiters were elided from
    # this copy (leaving bare expressions); restored.
601 def _make_valid_url(cls):
602 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
605 def suitable(cls, url):
606 return re.match(cls._make_valid_url(), url) is not None
608 def _real_extract(self, query):
609 mobj = re.match(self._make_valid_url(), query)
611 raise ExtractorError(u'Invalid search query "%s"' % query)
613 prefix = mobj.group('prefix')
614 query = mobj.group('query')
616 return self._get_n_results(query, 1)
617 elif prefix == 'all':
618 return self._get_n_results(query, self._MAX_RESULTS)
622 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
623 elif n > self._MAX_RESULTS:
624 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
625 n = self._MAX_RESULTS
626 return self._get_n_results(query, n)
628 def _get_n_results(self, query, n):
629 """Get a specified number of results for a query"""
630 raise NotImplementedError("This method must be implemented by subclasses")
633 def SEARCH_KEY(self):
634 return self._SEARCH_KEY