9 import xml.etree.ElementTree
14 compat_urllib_parse_urlparse,
# Unique sentinel: lets helpers such as _search_regex() distinguish
# "caller passed no default" from an explicit default of None.
_NO_DEFAULT = object()
27 class InfoExtractor(object):
28 """Information Extractor class.
30 Information extractors are the classes that, given a URL, extract
31 information about the video (or videos) the URL refers to. This
32 information includes the real video URL, the video title, author and
33 others. The information is stored in a dictionary which is then
34 passed to the FileDownloader. The FileDownloader processes this
35 information possibly downloading the video to the file system, among
36 other possible outcomes.
38 The dictionaries must include the following fields:
41 title: Video title, unescaped.
43 Additionally, it must contain either a formats entry or a url one:
45 formats: A list of dictionaries for each format available, ordered
46 from worst to best quality.
49 * url Mandatory. The URL of the video file
50 * ext Will be calculated from url if missing
51 * format A human-readable description of the format
52 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
and format_note fields if missing.
55 * format_id A short description of the format
56 ("mp4_h264_opus" or "19").
57 Technically optional, but strongly recommended.
58 * format_note Additional info about the format
59 ("3D" or "DASH video")
60 * width Width of the video, if known
61 * height Height of the video, if known
62 * resolution Textual description of width and height
63 * tbr Average bitrate of audio and video in KBit/s
64 * abr Average audio bitrate in KBit/s
65 * acodec Name of the audio codec in use
66 * asr Audio sampling rate in Hertz
67 * vbr Average video bitrate in KBit/s
68 * vcodec Name of the video codec in use
69 * container Name of the container format
70 * filesize The number of bytes, if known in advance
71 * player_url SWF Player URL (used for rtmpdump).
72 * protocol The protocol that will be used for the actual
74 "http", "https", "rtsp", "rtmp", "m3u8" or so.
75 * preference Order number of this format. If this field is
76 present and not None, the formats get sorted
78 -1 for default (order by other properties),
79 -2 or smaller for less than default.
80 * quality Order number of the video quality of this
81 format, irrespective of the file format.
82 -1 for default (order by other properties),
83 -2 or smaller for less than default.
85 ext: Video filename extension.
86 format: The video format, defaults to ext (used for --get-format)
87 player_url: SWF Player URL (used for rtmpdump).
89 The following fields are optional:
91 display_id An alternative identifier for the video, not necessarily
92 unique, but available before title. Typically, id is
93 something like "4234987", title "Dancing naked mole rats",
94 and display_id "dancing-naked-mole-rats"
95 thumbnails: A list of dictionaries (with the entries "resolution" and
96 "url") for the varying thumbnails
97 thumbnail: Full URL to a video thumbnail image.
98 description: One-line video description.
99 uploader: Full name of the video uploader.
100 upload_date: Video upload date (YYYYMMDD).
101 uploader_id: Nickname or id of the video uploader.
102 location: Physical location of the video.
103 subtitles: The subtitle file contents as a dictionary in the format
104 {language: subtitles}.
105 duration: Length of the video in seconds, as an integer.
106 view_count: How many users have watched the video on the platform.
107 like_count: Number of positive ratings of the video
108 dislike_count: Number of negative ratings of the video
109 comment_count: Number of comments on the video
110 age_limit: Age restriction for the video, as an integer (years)
111 webpage_url: The url to the video webpage, if given to youtube-dl it
112 should allow to get the same result again. (It will be set
113 by YoutubeDL if it's missing)
115 Unless mentioned otherwise, the fields should be Unicode strings.
117 Subclasses of this one should re-define the _real_initialize() and
118 _real_extract() methods and define a _VALID_URL regexp.
119 Probably, they should also be added to the list of extractors.
121 _real_extract() must return a *list* of information dictionaries as
124 Finally, the _WORKING attribute should be set to False for broken IEs
125 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader.

    downloader: the FileDownloader-style object that performs network
        operations; may also be attached later via set_downloader().
    """
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""

    # Deliberately probe cls.__dict__ instead of hasattr/getattr: the
    # compiled pattern must be cached on *this* class, and getattr would
    # also pick up a regexp cached on a superclass.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
150 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Delegates to the subclass hook _real_initialize().
    self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # Run initialization (authentication, etc. -- see initialize()) before
    # handing off to the subclass extraction hook.
    self.initialize()
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* (a FileDownloader-like object) to this IE."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # Intentionally a no-op in the base class.
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # No base implementation; every concrete IE must override this.
178 """A string for getting the InfoExtractor with get_info_extractor"""
179 return cls.__name__[:-2]
183 return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    # note semantics: None -> default "Downloading webpage" message,
    # False -> stay silent, any string -> printed (with video id if known).
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen(u'%s' % (note,))
        else:
            self.to_screen(u'%s: %s' % (video_id, note))
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # errnote=False means the caller handles failures entirely itself
        if errnote is False:
            return False
        if errnote is None:
            errnote = u'Unable to download webpage'
        errmsg = u'%s: %s' % (errnote, compat_str(err))
        if fatal:
            # Preserve the original traceback of the network error
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self._downloader.report_warning(errmsg)
            return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False in non-fatal mode
        assert not fatal
        return False
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Pick the text encoding: HTTP Content-Type header first, then a
    # <meta charset> tag, then a UTF-16 BOM check, finally UTF-8.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            # UTF-16 little-endian byte order mark
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        # Keep the dump filename a manageable length
        if len(url) > 200:
            h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
            url = url[:200 - len(h)] + h
        raw_filename = ('%s_%s.dump' % (video_id, url))
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen(u'Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    content = webpage_bytes.decode(encoding, 'replace')
    return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string """
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    if res is False:
        # _download_webpage_handle already reported the non-fatal failure
        return res
    else:
        # res is a (content, URL handle) tuple; only the content is wanted
        content, _ = res
        return content
def _download_xml(self, url_or_request, video_id,
                  note=u'Downloading XML', errnote=u'Unable to download XML',
                  transform_source=None):
    """Return the xml as an xml.etree.ElementTree.Element

    transform_source: optional callable applied to the downloaded text
        before it is parsed (default None -> no transformation).
    """
    xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
    # Only call transform_source when one was actually supplied
    if transform_source:
        xml_string = transform_source(xml_string)
    return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
                   note=u'Downloading JSON metadata',
                   errnote=u'Unable to download JSON metadata',
                   transform_source=None):
    """Download a page and parse it as JSON.

    transform_source: optional callable applied to the downloaded text
        before parsing (default None -> no transformation).
    Raises ExtractorError when the text is not valid JSON.
    """
    json_string = self._download_webpage(url_or_request, video_id, note, errnote)
    if transform_source:
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with the IE name and,
    when given, the video id."""
    if video_id is None:
        idstr = u''
    else:
        idstr = u'%s: ' % video_id
    self._downloader.report_warning(u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Write msg to the screen via the downloader, prefixed '[ie_name]'."""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the video's webpage is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    self.to_screen(u'Confirming age')
def report_login(self):
    """Announce a login attempt."""
    self.to_screen(u'Logging in')
# Methods for following #608
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    # The id is optional; only include it when known
    if video_id is not None:
        video_info['id'] = video_id
    return video_info
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    video_info = {'_type': 'playlist',
                  'entries': entries}
    # id and title are optional; only include them when truthy
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # A list of patterns: the first one that matches wins
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Colorize the field name on terminals that support ANSI escapes
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    else:
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if res:
        return clean_html(res).strip()
    else:
        # Propagate falsy results (e.g. None from a non-fatal miss) as-is
        return res
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    # Without a downloader there is nowhere to read credentials from
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        try:
            # authenticators() returns a (login, account, password) triple
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # .netrc problems are non-fatal: warn and fall through
            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

    return (username, password)
406 # Helper functions for extracting OpenGraph info
408 def _og_regexes(prop):
409 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
410 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
411 template = r'<meta[^>]+?%s[^>]+?%s'
413 template % (property_re, content_re),
414 template % (content_re, property_re),
def _og_search_property(self, prop, html, name=None, **kargs):
    """Extract the value of the og:<prop> OpenGraph meta tag from html."""
    if name is None:
        name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    # A non-fatal miss yields None; don't try to unescape it
    if escaped is None:
        return None
    return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Extract og:image as the thumbnail URL; a missing tag is not fatal."""
    return self._og_search_property(
        'image', html, u'thumbnail url', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    """Extract og:description; a missing tag is not fatal."""
    return self._og_search_property(
        'description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    """Extract the og:title value from html."""
    return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Find the OpenGraph video URL, trying og:video:secure_url first
    when secure is True."""
    regexes = self._og_regexes('video')
    if secure:
        regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
def _html_search_meta(self, name, html, display_name=None):
    """Extract the content attribute of the <meta> tag whose
    itemprop/name/property equals *name*; non-fatal (returns None)."""
    if display_name is None:
        display_name = name
    return self._html_search_regex(
        r'''(?ix)<meta
            (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
            [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
        html, display_name, fatal=False)
def _dc_search_uploader(self, html):
    """Look up the Dublin Core creator meta tag as the uploader name."""
    return self._html_search_meta('dc.creator', html, 'uploader')
def _rta_search(self, html):
    """Return age limit 18 if the page carries the RTA adult label, else 0."""
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    if re.search(r'(?ix)<meta\s+name="rating"\s+'
                 r' content="RTA-5042-1996-1400-1577-RTA"',
                 html):
        return 18

    # No RTA label present
    return 0
def _media_rating_search(self, html):
    """Map a <meta name="rating"> value to an age limit (None if absent
    or unrecognized)."""
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)

    if not rating:
        return None

    # NOTE(review): table values reconstructed -- confirm against the
    # rating vocabulary actually used by supported sites.
    RATING_TABLE = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    return RATING_TABLE.get(rating.lower(), None)
def _twitter_search_player(self, html):
    """Return the Twitter-card player URL from the page's meta tags."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
def _sort_formats(self, formats):
    """Sort the formats list in place from worst to best quality."""
    if not formats:
        raise ExtractorError(u'No video formats found')

    def _formats_key(f):
        # Sort key for one format dict; larger tuples sort later (= better).
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            # Plain HTTP(S) downloads are preferred over other protocols
            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none': # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
            else:
                ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'flv', u'mp4', u'webm']
            else:
                ORDER = [u'webm', u'flv', u'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        return (
            preference,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            ext_preference,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('format_id'),
        )
    formats.sort(key=_formats_key)
536 class SearchInfoExtractor(InfoExtractor):
538 Base class for paged search queries extractors.
539 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
540 Instances should define _SEARCH_KEY and _MAX_RESULTS.
def _make_valid_url(cls):
    """Build the search-"URL" pattern from this extractor's _SEARCH_KEY."""
    prefix_and_query = r'(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
    return r'%s%s' % (cls._SEARCH_KEY, prefix_and_query)
def suitable(cls, url):
    """A URL is suitable iff it matches this extractor's search pattern."""
    mobj = re.match(cls._make_valid_url(), url)
    return mobj is not None
def _real_extract(self, query):
    """Parse the search "URL" (SEARCH_KEY + count prefix + query) and
    dispatch to _get_n_results with the requested result count."""
    mobj = re.match(self._make_valid_url(), query)
    if mobj is None:
        raise ExtractorError(u'Invalid search query "%s"' % query)

    prefix = mobj.group('prefix')
    query = mobj.group('query')
    if prefix == '':
        # No count given: fetch a single result
        return self._get_n_results(query, 1)
    elif prefix == 'all':
        return self._get_n_results(query, self._MAX_RESULTS)
    else:
        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
def _get_n_results(self, query, n):
    """Fetch up to n results for the search query (subclass hook)."""
    raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    # Read-only accessor for the class-level search key.
    # NOTE(review): presumably decorated with @property in the full
    # source -- the decorator line is not visible here; confirm.
    return self._SEARCH_KEY