10 import xml.etree.ElementTree
15 compat_urllib_parse_urlparse,
# Module-level sentinel: distinguishes "caller supplied no default" from an
# explicit default of None in _search_regex / _html_search_regex below.
25 _NO_DEFAULT = object()
# NOTE(review): this chunk is a sampled view — the embedded original line
# numbers are non-contiguous, so lines of this class (including parts of the
# docstring) are missing here. Do not assume the text below is complete.
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information about the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title, author and
34 others. The information is stored in a dictionary which is then
35 passed to the FileDownloader. The FileDownloader processes this
36 information possibly downloading the video to the file system, among
37 other possible outcomes.
39 The dictionaries must include the following fields:
42 title: Video title, unescaped.
44 Additionally, it must contain either a formats entry or a url one:
46 formats: A list of dictionaries for each format available, ordered
47 from worst to best quality.
50 * url Mandatory. The URL of the video file
51 * ext Will be calculated from url if missing
52 * format A human-readable description of the format
53 ("mp4 container with h264/opus").
54 Calculated from the format_id, width, height,
55 and format_note fields if missing.
56 * format_id A short description of the format
57 ("mp4_h264_opus" or "19").
58 Technically optional, but strongly recommended.
59 * format_note Additional info about the format
60 ("3D" or "DASH video")
61 * width Width of the video, if known
62 * height Height of the video, if known
63 * resolution Textual description of width and height
64 * tbr Average bitrate of audio and video in KBit/s
65 * abr Average audio bitrate in KBit/s
66 * acodec Name of the audio codec in use
67 * asr Audio sampling rate in Hertz
68 * vbr Average video bitrate in KBit/s
69 * vcodec Name of the video codec in use
70 * container Name of the container format
71 * filesize The number of bytes, if known in advance
72 * player_url SWF Player URL (used for rtmpdump).
73 * protocol The protocol that will be used for the actual
75 "http", "https", "rtsp", "rtmp", "m3u8" or so.
76 * preference Order number of this format. If this field is
77 present and not None, the formats get sorted
78 by this field, regardless of all other values.
79 -1 for default (order by other properties),
80 -2 or smaller for less than default.
81 * quality Order number of the video quality of this
82 format, irrespective of the file format.
83 -1 for default (order by other properties),
84 -2 or smaller for less than default.
86 ext: Video filename extension.
87 format: The video format, defaults to ext (used for --get-format)
88 player_url: SWF Player URL (used for rtmpdump).
90 The following fields are optional:
92 display_id An alternative identifier for the video, not necessarily
93 unique, but available before title. Typically, id is
94 something like "4234987", title "Dancing naked mole rats",
95 and display_id "dancing-naked-mole-rats"
96 thumbnails: A list of dictionaries, with the following entries:
98 * "width" (optional, int)
99 * "height" (optional, int)
100 * "resolution" (optional, string "{width}x{height}",
102 thumbnail: Full URL to a video thumbnail image.
103 description: One-line video description.
104 uploader: Full name of the video uploader.
105 timestamp: UNIX timestamp of the moment the video became available.
106 upload_date: Video upload date (YYYYMMDD).
107 If not explicitly set, calculated from timestamp.
108 uploader_id: Nickname or id of the video uploader.
109 location: Physical location of the video.
110 subtitles: The subtitle file contents as a dictionary in the format
111 {language: subtitles}.
112 duration: Length of the video in seconds, as an integer.
113 view_count: How many users have watched the video on the platform.
114 like_count: Number of positive ratings of the video
115 dislike_count: Number of negative ratings of the video
116 comment_count: Number of comments on the video
117 age_limit: Age restriction for the video, as an integer (years)
118 webpage_url: The url to the video webpage, if given to youtube-dl it
119 should allow to get the same result again. (It will be set
120 by YoutubeDL if it's missing)
121 categories: A list of categories that the video falls in, for example
124 Unless mentioned otherwise, the fields should be Unicode strings.
126 Subclasses of this one should re-define the _real_initialize() and
127 _real_extract() methods and define a _VALID_URL regexp.
128 Probably, they should also be added to the list of extractors.
130 Finally, the _WORKING attribute should be set to False for broken IEs
131 in order to warn the users and skip the tests.
# Constructor only wires up the downloader; further per-instance state may be
# initialized on lines not visible in this view (numbering gap at 140).
138 def __init__(self, downloader=None):
139 """Constructor. Receives an optional downloader."""
141 self.set_downloader(downloader)
# NOTE(review): 'suitable' takes cls as first parameter — presumably a
# @classmethod whose decorator line (143) is missing from this view; confirm.
144 def suitable(cls, url):
145 """Receives a URL and returns True if suitable for this IE."""
147 # This does not use has/getattr intentionally - we want to know whether
148 # we have cached the regexp for *this* class, whereas getattr would also
149 # match the superclass
# Lazily compile _VALID_URL once per concrete class and cache it in the
# class's own __dict__ (not inherited), then match against the URL.
150 if '_VALID_URL_RE' not in cls.__dict__:
151 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
152 return cls._VALID_URL_RE.match(url) is not None
# Orphaned docstring: the enclosing def (a _WORKING getter at line 155) is
# not visible in this view.
156 """Getter method for _WORKING."""
# Public entry point; delegates to the subclass hook _real_initialize().
# A repeated-initialization guard likely exists on a missing line (161).
159 def initialize(self):
160 """Initializes an instance (authentication, etc)."""
162 self._real_initialize()
# Public entry point; delegates extraction to the subclass hook.
165 def extract(self, url):
166 """Extracts URL information and returns it in list of dicts."""
168 return self._real_extract(url)
# Stores the FileDownloader/YoutubeDL instance used for output and network I/O.
170 def set_downloader(self, downloader):
171 """Sets the downloader for this IE."""
172 self._downloader = downloader
# Subclass hooks: default implementations are no-ops / visible bodies only.
174 def _real_initialize(self):
175 """Real initialization process. Redefine in subclasses."""
178 def _real_extract(self, url):
179 """Real extraction process. Redefine in subclasses."""
# Orphaned fragments: the defs for ie_key (a classmethod, line ~182-183) and
# the IE_NAME property (line ~187-188) are missing from this view.
# [:-2] presumably strips a trailing "IE" suffix from the class name — TODO
# confirm against the naming convention of the concrete extractor classes.
184 """A string for getting the InfoExtractor with get_info_extractor"""
185 return cls.__name__[:-2]
189 return type(self).__name__[:-2]
191 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
192 """ Returns the response handle """
# Progress reporting: default message, suppression via note=False, or a
# custom note (with/without video_id prefix). The governing if/else lines
# (193, 196, 198) are missing from this view.
194 self.report_download_webpage(video_id)
195 elif note is not False:
197 self.to_screen(u'%s' % (note,))
199 self.to_screen(u'%s: %s' % (video_id, note))
# Delegate the actual network request to the downloader (inside a try whose
# header line 200 is not visible here).
201 return self._downloader.urlopen(url_or_request)
202 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
206 errnote = u'Unable to download webpage'
207 errmsg = u'%s: %s' % (errnote, compat_str(err))
# Failure policy — presumably selected by 'fatal' on missing lines 208/210:
# raise ExtractorError (preserving the traceback) or just warn and continue.
209 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
211 self._downloader.report_warning(errmsg)
214 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
215 """ Returns a tuple (page content as string, URL handle) """
217 # Strip hashes from the URL (#1038)
218 if isinstance(url_or_request, (compat_str, str)):
219 url_or_request = url_or_request.partition('#')[0]
221 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
# Charset detection, in priority order: (1) HTTP Content-Type header,
# (2) <meta charset=...> in the first 1024 bytes, (3) UTF-16 LE BOM.
# Some branch headers are on lines missing from this view (228, 230, 233).
225 content_type = urlh.headers.get('Content-Type', '')
226 webpage_bytes = urlh.read()
227 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
229 encoding = m.group(1)
231 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
232 webpage_bytes[:1024])
234 encoding = m.group(1).decode('ascii')
235 elif webpage_bytes.startswith(b'\xff\xfe'):
# Debug aid: dump the base64-encoded page to the screen when requested.
239 if self._downloader.params.get('dump_intermediate_pages', False):
241 url = url_or_request.get_full_url()
242 except AttributeError:
244 self.to_screen(u'Dumping request to ' + url)
245 dump = base64.b64encode(webpage_bytes).decode('ascii')
246 self._downloader.to_screen(dump)
# Debug aid: persist the raw response bytes to a .dump file. The name is
# truncated to 240 chars plus an md5 suffix — presumably to stay under a
# 255-char filename limit; confirm.
247 if self._downloader.params.get('write_pages', False):
249 url = url_or_request.get_full_url()
250 except AttributeError:
252 basen = '%s_%s' % (video_id, url)
254 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
255 basen = basen[:240 - len(h)] + h
256 raw_filename = basen + '.dump'
257 filename = sanitize_filename(raw_filename, restricted=True)
258 self.to_screen(u'Saving request to ' + filename)
259 with open(filename, 'wb') as outf:
260 outf.write(webpage_bytes)
# Decode with the detected encoding, falling back to UTF-8; 'replace' keeps
# decoding from ever raising on bad bytes.
263 content = webpage_bytes.decode(encoding, 'replace')
265 content = webpage_bytes.decode('utf-8', 'replace')
# Detect a Websense corporate-filter block page and fail with a clear,
# user-facing ("expected") error instead of a confusing parse failure.
267 if (u'<title>Access to this site is blocked</title>' in content and
268 u'Websense' in content[:512]):
269 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
270 blocked_iframe = self._html_search_regex(
271 r'<iframe src="([^"]+)"', content,
272 u'Websense information URL', default=None)
274 msg += u' Visit %s for more details' % blocked_iframe
275 raise ExtractorError(msg, expected=True)
277 return (content, urlh)
279 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
280 """ Returns the data of the page as a string """
# Thin wrapper over _download_webpage_handle; the unpacking/return lines
# (282-286) are missing from this view.
281 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
288 def _download_xml(self, url_or_request, video_id,
289 note=u'Downloading XML', errnote=u'Unable to download XML',
290 transform_source=None, fatal=True):
291 """Return the xml as an xml.etree.ElementTree.Element"""
292 xml_string = self._download_webpage(
293 url_or_request, video_id, note, errnote, fatal=fatal)
# A False result signals a non-fatal download failure upstream; the early
# return (line 295) and the transform_source guard (296) are not visible.
294 if xml_string is False:
297 xml_string = transform_source(xml_string)
# Re-encode to UTF-8 bytes before parsing so the XML declaration (if any)
# cannot disagree with the Python str encoding.
298 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
300 def _download_json(self, url_or_request, video_id,
301 note=u'Downloading JSON metadata',
302 errnote=u'Unable to download JSON metadata',
303 transform_source=None):
304 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
306 json_string = transform_source(json_string)
308 return json.loads(json_string)
309 except ValueError as ve:
# NOTE(review): this fires on json.loads parse failure, so the message
# 'Failed to download JSON' is misleading — 'Failed to parse JSON' would be
# accurate. Left unchanged here (runtime string).
310 raise ExtractorError('Failed to download JSON', cause=ve)
# Warn via the downloader, prefixed with the IE name and (optionally) the
# video id.
312 def report_warning(self, msg, video_id=None):
313 idstr = u'' if video_id is None else u'%s: ' % video_id
314 self._downloader.report_warning(
315 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
317 def to_screen(self, msg):
318 """Print msg to screen, prefixing it with '[ie_name]'"""
319 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
# The following report_* helpers standardize progress messages so all
# extractors log consistently.
321 def report_extraction(self, id_or_name):
322 """Report information extraction."""
323 self.to_screen(u'%s: Extracting information' % id_or_name)
325 def report_download_webpage(self, video_id):
326 """Report webpage download."""
327 self.to_screen(u'%s: Downloading webpage' % video_id)
329 def report_age_confirmation(self):
330 """Report attempt to confirm age."""
331 self.to_screen(u'Confirming age')
333 def report_login(self):
334 """Report attempt to log in."""
335 self.to_screen(u'Logging in')
337 #Methods for following #608
# NOTE(review): neither function below takes self — presumably @staticmethod
# decorators sit on lines missing from this view (338, 348); confirm.
# Builds a '_type': 'url' info dict telling YoutubeDL to re-process the URL;
# the dict literal's remaining entries and the return are on missing lines.
339 def url_result(url, ie=None, video_id=None):
340 """Returns a url that points to a page that should be processed"""
341 #TODO: ie should be the class used for getting the info
342 video_info = {'_type': 'url',
345 if video_id is not None:
346 video_info['id'] = video_id
# Builds a '_type': 'playlist' info dict; id/title are set only when given
# (the guarding if lines 353/355 and the return are not visible here).
349 def playlist_result(entries, playlist_id=None, playlist_title=None):
350 """Returns a playlist"""
351 video_info = {'_type': 'playlist',
354 video_info['id'] = playlist_id
356 video_info['title'] = playlist_title
359 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
361 Perform a regex search on the given string, using a single or a list of
362 patterns returning the first matching group.
363 In case of failure return a default value or raise a WARNING or a
364 RegexNotFoundError, depending on fatal, specifying the field name.
# Accept a single pattern (str or precompiled) or a list of patterns; for a
# list, the first pattern that matches wins (the loop header at 368-369 is
# on lines missing from this view).
366 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
367 mobj = re.search(pattern, string, flags)
370 mobj = re.search(p, string, flags)
# Colorize the field name (ANSI blue) only on a real tty outside Windows.
373 if os.name != 'nt' and sys.stderr.isatty():
374 _name = u'\033[0;34m%s\033[0m' % name
379 # return the first matching group
380 return next(g for g in mobj.groups() if g is not None)
# _NO_DEFAULT sentinel distinguishes "no default given" from default=None.
381 elif default is not _NO_DEFAULT:
384 raise RegexNotFoundError(u'Unable to extract %s' % _name)
386 self._downloader.report_warning(u'unable to extract %s; '
387 u'please report this issue on http://yt-dl.org/bug' % _name)
390 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
392 Like _search_regex, but strips HTML tags and unescapes entities.
394 res = self._search_regex(pattern, string, name, default, fatal, flags)
# clean_html handles tag stripping/entity unescaping; a None-guard (line
# 395) is presumably on a missing line — confirm.
396 return clean_html(res).strip()
400 def _get_login_info(self):
402 Get the login info as (username, password)
403 It will look in the netrc file using the _NETRC_MACHINE value
404 If there's no info available, return (None, None)
# Without a downloader there are no params to consult (early return on
# lines 407-409 not visible here).
406 if self._downloader is None:
411 downloader_params = self._downloader.params
413 # Attempt to use provided username and password or .netrc data
# Explicit command-line credentials take priority over .netrc.
414 if downloader_params.get('username', None) is not None:
415 username = downloader_params['username']
416 password = downloader_params['password']
417 elif downloader_params.get('usenetrc', False):
419 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# .netrc problems are non-fatal: warn and fall through with whatever
# credentials (if any) were gathered.
424 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
425 except (IOError, netrc.NetrcParseError) as err:
426 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
428 return (username, password)
430 # Helper functions for extracting OpenGraph info
# Builds regexes matching an og:<prop> <meta> tag with the property/content
# attributes in either order. Presumably a @staticmethod (decorator line 431
# missing from this view); the enclosing return list brackets are on missing
# lines 436/439.
432 def _og_regexes(prop):
433 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
434 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
435 template = r'<meta[^>]+?%s[^>]+?%s'
437 template % (property_re, content_re),
438 template % (content_re, property_re),
# Generic OpenGraph property lookup; unescapes HTML entities in the result.
441 def _og_search_property(self, prop, html, name=None, **kargs):
443 name = 'OpenGraph %s' % prop
444 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
447 return unescapeHTML(escaped)
# Convenience wrappers for the common og: properties; thumbnail/description
# are non-fatal since many pages omit them.
449 def _og_search_thumbnail(self, html, **kargs):
450 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
452 def _og_search_description(self, html, **kargs):
453 return self._og_search_property('description', html, fatal=False, **kargs)
455 def _og_search_title(self, html, **kargs):
456 return self._og_search_property('title', html, **kargs)
# Prefers og:video:secure_url over og:video when secure=True.
458 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
459 regexes = self._og_regexes('video')
460 if secure: regexes = self._og_regexes('video:secure_url') + regexes
461 return self._html_search_regex(regexes, html, name, **kargs)
463 def _og_search_url(self, html, **kargs):
464 return self._og_search_property('url', html, **kargs)
# Generic <meta itemprop/name/property=...> content lookup by name.
466 def _html_search_meta(self, name, html, display_name=None, fatal=False):
467 if display_name is None:
469 return self._html_search_regex(
471 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
472 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
473 html, display_name, fatal=fatal)
# Dublin Core uploader lookup via the dc.creator meta tag.
475 def _dc_search_uploader(self, html):
476 return self._html_search_meta('dc.creator', html, 'uploader')
478 def _rta_search(self, html):
479 # See http://www.rtalabel.org/index.php?content=howtofaq#single
# RTA label present => adult content; the returns (presumably 18 / 0 on
# missing lines 482-484) are not visible in this view.
480 if re.search(r'(?ix)<meta\s+name="rating"\s+'
481 r' content="RTA-5042-1996-1400-1577-RTA"',
486 def _media_rating_search(self, html):
487 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
488 rating = self._html_search_meta('rating', html)
# RATING_TABLE mapping rating strings to age limits is defined on lines
# missing from this view (489-499); unknown ratings yield None.
500 return RATING_TABLE.get(rating.lower(), None)
# Twitter Card player URL from the twitter:player meta tag.
502 def _twitter_search_player(self, html):
503 return self._html_search_meta('twitter:player', html,
504 'twitter card player')
# Sorts the formats list in place, worst first / best last. The sort key is
# built by a nested _formats_key(f) whose def line (~515) and several branch
# headers are on lines missing from this view.
506 def _sort_formats(self, formats):
508 raise ExtractorError(u'No video formats found')
511 # TODO remove the following workaround
512 from ..utils import determine_ext
513 if not f.get('ext') and 'url' in f:
514 f['ext'] = determine_ext(f['url'])
# Explicit 'preference' wins; otherwise derive it from the protocol
# (plain http/https slightly preferred over the rest).
516 preference = f.get('preference')
517 if preference is None:
518 proto = f.get('protocol')
520 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
522 preference = 0 if proto in ['http', 'https'] else -0.1
523 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
# Audio-only formats rank extensions by the user's free-format preference.
526 if f.get('vcodec') == 'none': # audio only
527 if self._downloader.params.get('prefer_free_formats'):
528 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
530 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
533 audio_ext_preference = ORDER.index(f['ext'])
535 audio_ext_preference = -1
# Video formats rank container extensions the same way.
537 if self._downloader.params.get('prefer_free_formats'):
538 ORDER = [u'flv', u'mp4', u'webm']
540 ORDER = [u'webm', u'flv', u'mp4']
542 ext_preference = ORDER.index(f['ext'])
545 audio_ext_preference = 0
# Key tuple (partially visible): missing metrics default to -1 so that
# unknown values sort below any known value.
549 f.get('quality') if f.get('quality') is not None else -1,
550 f.get('height') if f.get('height') is not None else -1,
551 f.get('width') if f.get('width') is not None else -1,
553 f.get('tbr') if f.get('tbr') is not None else -1,
554 f.get('vbr') if f.get('vbr') is not None else -1,
555 f.get('abr') if f.get('abr') is not None else -1,
556 audio_ext_preference,
557 f.get('filesize') if f.get('filesize') is not None else -1,
560 formats.sort(key=_formats_key)
562 def http_scheme(self):
563 """ Either "http:" or "https:", depending on the user's preferences """
# Ternary selecting the scheme (the surrounding return/else lines 564-567
# are missing from this view): prefer_insecure => "http:".
566 if self._downloader.params.get('prefer_insecure', False)
# Resolves protocol-relative URLs ("//host/path") against a scheme,
# defaulting to the user's preferred scheme.
569 def _proto_relative_url(self, url, scheme=None):
572 if url.startswith('//'):
574 scheme = self.http_scheme()
# Sleeps for 'timeout' seconds after printing a templated message; the
# actual time.sleep call is on a line missing from this view.
579 def _sleep(self, timeout, video_id, msg_template=None):
580 if msg_template is None:
581 msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
582 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
587 class SearchInfoExtractor(InfoExtractor):
589 Base class for paged search queries extractors.
590 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
591 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Builds the URL regex from _SEARCH_KEY: empty prefix => 1 result,
# 'all' => everything, or an explicit positive count. Presumably a
# @classmethod (decorator line not visible); confirm.
595 def _make_valid_url(cls):
596 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
599 def suitable(cls, url):
600 return re.match(cls._make_valid_url(), url) is not None
602 def _real_extract(self, query):
603 mobj = re.match(self._make_valid_url(), query)
605 raise ExtractorError(u'Invalid search query "%s"' % query)
607 prefix = mobj.group('prefix')
608 query = mobj.group('query')
# Empty prefix: single result; 'all': up to _MAX_RESULTS; otherwise a
# numeric count (parsed on lines missing from this view), rejected if
# non-positive and clamped to _MAX_RESULTS with a warning.
610 return self._get_n_results(query, 1)
611 elif prefix == 'all':
612 return self._get_n_results(query, self._MAX_RESULTS)
616 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
617 elif n > self._MAX_RESULTS:
618 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
619 n = self._MAX_RESULTS
620 return self._get_n_results(query, n)
# Abstract hook: subclasses perform the actual paged search.
622 def _get_n_results(self, query, n):
623 """Get a specified number of results for a query"""
624 raise NotImplementedError("This method must be implemented by subclasses")
# Read-only accessor exposing _SEARCH_KEY (presumably a @property on a
# missing decorator line; confirm).
627 def SEARCH_KEY(self):
628 return self._SEARCH_KEY