9 import xml.etree.ElementTree
14 compat_urllib_parse_urlparse,
# Unique sentinel object: lets the _search_regex family distinguish
# "caller supplied no default" from an explicit default of None.
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
127 def __init__(self, downloader=None):
128 """Constructor. Receives an optional downloader."""
130 self.set_downloader(downloader)
133 def suitable(cls, url):
134 """Receives a URL and returns True if suitable for this IE."""
136 # This does not use has/getattr intentionally - we want to know whether
137 # we have cached the regexp for *this* class, whereas getattr would also
138 # match the superclass
139 if '_VALID_URL_RE' not in cls.__dict__:
140 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
141 return cls._VALID_URL_RE.match(url) is not None
145 """Getter method for _WORKING."""
148 def initialize(self):
149 """Initializes an instance (authentication, etc)."""
151 self._real_initialize()
154 def extract(self, url):
155 """Extracts URL information and returns it in list of dicts."""
157 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    downloader: object this extractor delegates to for network access and
    user messaging (urlopen/to_screen/report_warning/params are used on it
    elsewhere in this class); may be None.
    """
    self._downloader = downloader
163 def _real_initialize(self):
164 """Real initialization process. Redefine in subclasses."""
167 def _real_extract(self, url):
168 """Real extraction process. Redefine in subclasses."""
173 """A string for getting the InfoExtractor with get_info_extractor"""
174 return cls.__name__[:-2]
178 return type(self).__name__[:-2]
180 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
181 """ Returns the response handle """
183 self.report_download_webpage(video_id)
184 elif note is not False:
186 self.to_screen(u'%s' % (note,))
188 self.to_screen(u'%s: %s' % (video_id, note))
190 return self._downloader.urlopen(url_or_request)
191 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
195 errnote = u'Unable to download webpage'
196 errmsg = u'%s: %s' % (errnote, compat_str(err))
198 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
200 self._downloader.report_warning(errmsg)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False in the non-fatal case.
        assert not fatal
        return False
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Charset detection: HTTP Content-Type header first, then a <meta>
    # charset declaration in the first 1024 bytes, finally UTF-8.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        else:
            encoding = 'utf-8'
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        if len(url) > 200:
            # Keep dump filenames within filesystem limits: truncate and
            # append a hash so long URLs still map to distinct files.
            # (.encode: hashlib.md5 requires bytes on Python 3)
            h = hashlib.md5(url.encode('utf-8')).hexdigest()
            url = url[:200 - len(h)] + h
        raw_filename = ('%s_%s.dump' % (video_id, url))
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen(u'Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    content = webpage_bytes.decode(encoding, 'replace')
    return (content, urlh)
251 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
252 """ Returns the data of the page as a string """
253 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
260 def _download_xml(self, url_or_request, video_id,
261 note=u'Downloading XML', errnote=u'Unable to download XML',
262 transform_source=None):
263 """Return the xml as an xml.etree.ElementTree.Element"""
264 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
266 xml_string = transform_source(xml_string)
267 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
269 def _download_json(self, url_or_request, video_id,
270 note=u'Downloading JSON metadata',
271 errnote=u'Unable to download JSON metadata'):
272 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
274 return json.loads(json_string)
275 except ValueError as ve:
276 raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Forward msg to the downloader's warning channel, prefixed with
    '[IE_NAME]' and, when given, the video id."""
    idstr = u'' if video_id is None else u'%s: ' % video_id
    self._downloader.report_warning(
        u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    # All user-visible output is routed through the attached downloader.
    self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Report information extraction."""
    # id_or_name: video id or human-readable name used to tag the message.
    self.to_screen(u'%s: Extracting information' % id_or_name)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    # Default progress message used by _request_webpage when note is None.
    self.to_screen(u'%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen(u'Confirming age')
def report_login(self):
    """Report attempt to log in."""
    self.to_screen(u'Logging in')
303 #Methods for following #608
305 def url_result(url, ie=None, video_id=None):
306 """Returns a url that points to a page that should be processed"""
307 #TODO: ie should be the class used for getting the info
308 video_info = {'_type': 'url',
311 if video_id is not None:
312 video_info['id'] = video_id
315 def playlist_result(entries, playlist_id=None, playlist_title=None):
316 """Returns a playlist"""
317 video_info = {'_type': 'playlist',
320 video_info['id'] = playlist_id
322 video_info['title'] = playlist_title
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # A list of patterns: the first one that matches wins.
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Colorize the field name on capable terminals (not on Windows).
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    else:
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if res:
        return clean_html(res).strip()
    else:
        return None
366 def _get_login_info(self):
368 Get the the login info as (username, password)
369 It will look in the netrc file using the _NETRC_MACHINE value
370 If there's no info available, return (None, None)
372 if self._downloader is None:
377 downloader_params = self._downloader.params
379 # Attempt to use provided username and password or .netrc data
380 if downloader_params.get('username', None) is not None:
381 username = downloader_params['username']
382 password = downloader_params['password']
383 elif downloader_params.get('usenetrc', False):
385 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
390 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
391 except (IOError, netrc.NetrcParseError) as err:
392 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
394 return (username, password)
396 # Helper functions for extracting OpenGraph info
398 def _og_regexes(prop):
399 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
400 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
401 template = r'<meta[^>]+?%s[^>]+?%s'
403 template % (property_re, content_re),
404 template % (content_re, property_re),
407 def _og_search_property(self, prop, html, name=None, **kargs):
409 name = 'OpenGraph %s' % prop
410 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
413 return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    # Non-fatal: yields None when og:image is absent.
    return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    # Non-fatal: yields None when og:description is absent.
    return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    # Uses _og_search_property's default (fatal) behaviour unless the
    # caller overrides it via kargs.
    return self._og_search_property('title', html, **kargs)
424 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
425 regexes = self._og_regexes('video')
426 if secure: regexes = self._og_regexes('video:secure_url') + regexes
427 return self._html_search_regex(regexes, html, name, **kargs)
429 def _html_search_meta(self, name, html, display_name=None):
430 if display_name is None:
432 return self._html_search_regex(
434 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
435 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
436 html, display_name, fatal=False)
def _dc_search_uploader(self, html):
    # dc.creator is the Dublin Core author field; exposed as "uploader".
    return self._html_search_meta('dc.creator', html, 'uploader')
441 def _rta_search(self, html):
442 # See http://www.rtalabel.org/index.php?content=howtofaq#single
443 if re.search(r'(?ix)<meta\s+name="rating"\s+'
444 r' content="RTA-5042-1996-1400-1577-RTA"',
449 def _media_rating_search(self, html):
450 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
451 rating = self._html_search_meta('rating', html)
463 return RATING_TABLE.get(rating.lower(), None)
def _sort_formats(self, formats):
    """Sort formats in-place from worst to best quality.

    The key favours (in order): explicit preference, quality, resolution,
    container preference, bitrates, audio codec/extension preference,
    filesize, and finally format_id as a deterministic tie-breaker.
    """
    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            # Plain HTTP(S) downloads are preferred over other protocols.
            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
            else:
                ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'flv', u'mp4', u'webm']
            else:
                ORDER = [u'webm', u'flv', u'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        return (
            preference,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            ext_preference,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('format_id'),
        )
    formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (first result), a positive integer, or "all".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY