9 import xml.etree.ElementTree
14 compat_urllib_parse_urlparse,
# Unique sentinel object: lets _search_regex()/_html_search_regex() tell
# "caller supplied no default" apart from an explicit default of None.
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download:
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.

    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): this view of the file has a gap here; the original may
        # initialize additional instance state (e.g. a readiness flag) before
        # attaching the downloader — confirm against the full file.
        self.set_downloader(downloader)
139 def suitable(cls, url):
140 """Receives a URL and returns True if suitable for this IE."""
142 # This does not use has/getattr intentionally - we want to know whether
143 # we have cached the regexp for *this* class, whereas getattr would also
144 # match the superclass
145 if '_VALID_URL_RE' not in cls.__dict__:
146 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
147 return cls._VALID_URL_RE.match(url) is not None
    """Getter method for _WORKING."""
    # NOTE(review): the 'def' header this docstring belongs to is missing
    # from this view of the file.

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard against repeated initialization may exist in
        # the original but is not visible here — confirm upstream.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): an initialize() call preceding extraction is not
        # visible in this view — confirm against the full file.
        return self._real_extract(url)
165 def set_downloader(self, downloader):
166 """Sets the downloader for this IE."""
167 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(review): the body is not visible in this view (presumably a
        # no-op in the base class).

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the body is not visible in this view (presumably a
        # no-op in the base class).

    """A string for getting the InfoExtractor with get_info_extractor"""
    # NOTE(review): the classmethod header for this docstring is missing from
    # this view; the line below strips a trailing 'IE' from the class name.
    return cls.__name__[:-2]

    # Instance-side counterpart: derive the IE name from the concrete class.
    # NOTE(review): the enclosing def/property header is missing from this
    # view of the file.
    return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # NOTE(review): several control-flow lines are missing from this view
        # (the 'if note is None:' opener, the 'try:' around urlopen, and the
        # fatal/non-fatal split after the except); statements below are
        # fragments of those branches.
        self.report_download_webpage(video_id)
        elif note is not False:
            # A bare note is printed as-is; with a video_id it is prefixed.
            self.to_screen(u'%s' % (note,))
            self.to_screen(u'%s: %s' % (video_id, note))
        return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            # Fatal path: re-raise as ExtractorError with traceback and cause.
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            # Non-fatal path: degrade to a warning.
            self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """
        # NOTE(review): this view of the file has gaps (try:/except openers,
        # some guards and returns); the statements below are partially
        # fragments — confirm control flow against the full file.

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Charset detection, in priority order: Content-Type header, then a
        # <meta charset=...> tag in the first KiB, then BOM sniffing.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1)
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
        # Debug aids: optionally dump the page to screen or to a file.
        if self._downloader.params.get('dump_intermediate_pages', False):
            url = url_or_request.get_full_url()
        except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            url = url_or_request.get_full_url()
        except AttributeError:
            basen = '%s_%s' % (video_id, url)
            # Keep the dump filename under filesystem limits: truncate and
            # append an md5 of the full name so it stays unique.
            h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        # Fallback decode path (utf-8) — the guard selecting between the two
        # is not visible in this view.
        content = webpage_bytes.decode('utf-8', 'replace')

        # Detect Websense corporate filtering pages and fail with a helpful,
        # expected error instead of garbage extraction output.
        if (u'<title>Access to this site is blocked</title>' in content and
                u'Websense' in content[:512]):
            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                u'Websense information URL', default=None)
            msg += u' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        # Delegates to _download_webpage_handle(). NOTE(review): the lines
        # that unpack the (content, handle) tuple and handle a False result
        # are missing from this view of the file.
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        # NOTE(review): the 'return False' for a failed non-fatal download and
        # the 'if transform_source:' guard are missing from this view.
        if xml_string is False:
        xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note=u'Downloading JSON metadata',
                       errnote=u'Unable to download JSON metadata',
                       transform_source=None):
        # Download a page and parse it as JSON, optionally pre-processing the
        # raw text via transform_source. NOTE(review): the
        # 'if transform_source:' guard and the 'try:' opener are missing from
        # this view of the file.
        json_string = self._download_webpage(url_or_request, video_id, note, errnote)
        json_string = transform_source(json_string)
        return json.loads(json_string)
        except ValueError as ve:
            raise ExtractorError('Failed to download JSON', cause=ve)
307 def report_warning(self, msg, video_id=None):
308 idstr = u'' if video_id is None else u'%s: ' % video_id
309 self._downloader.report_warning(
310 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
312 def to_screen(self, msg):
313 """Print msg to screen, prefixing it with '[ie_name]'"""
314 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
316 def report_extraction(self, id_or_name):
317 """Report information extraction."""
318 self.to_screen(u'%s: Extracting information' % id_or_name)
320 def report_download_webpage(self, video_id):
321 """Report webpage download."""
322 self.to_screen(u'%s: Downloading webpage' % video_id)
324 def report_age_confirmation(self):
325 """Report attempt to confirm age."""
326 self.to_screen(u'Confirming age')
328 def report_login(self):
329 """Report attempt to log in."""
330 self.to_screen(u'Logging in')
    # Methods for following issue #608
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        # NOTE(review): the @staticmethod decorator, the remaining dict
        # entries, and the final return are missing from this view.
        video_info = {'_type': 'url',
        if video_id is not None:
            video_info['id'] = video_id

    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # Ensure we don't have any duplicates in the playlist
        # NOTE(review): the de-duplication loop, the result-dict entries and
        # the final return are only partially visible in this view.
        theurl = tuple(url.items())
        if theurl not in seen:
        video_info = {'_type': 'playlist',
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # NOTE(review): the else-branch iterating a list of patterns and the
        # branch selection after the match are only partially visible here.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        mobj = re.search(p, string, flags)
        # Colorize the field name on capable terminals (not Windows).
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # NOTE(review): the guard for a None/empty result is missing from
        # this view of the file.
        return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        # NOTE(review): the early return for a missing downloader, the
        # username/password initialization, and the 'try:' opener around the
        # netrc lookup are missing from this view of the file.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # Best-effort: a broken .netrc only produces a warning.
            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
435 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        # Build regexes matching an OpenGraph <meta ... og:PROP ... content=...>
        # tag with the attributes in either order. NOTE(review): the
        # @staticmethod decorator and the 'return [' / ']' lines wrapping the
        # two template expansions are missing from this view.
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        template % (property_re, content_re),
        template % (content_re, property_re),
    def _og_search_property(self, prop, html, name=None, **kargs):
        # Extract an OpenGraph property value and unescape HTML entities.
        # NOTE(review): the 'if name is None:' guard before the default-name
        # assignment and a None-check on the match are missing from this view.
        name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
454 def _og_search_thumbnail(self, html, **kargs):
455 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
457 def _og_search_description(self, html, **kargs):
458 return self._og_search_property('description', html, fatal=False, **kargs)
460 def _og_search_title(self, html, **kargs):
461 return self._og_search_property('title', html, **kargs)
463 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
464 regexes = self._og_regexes('video')
465 if secure: regexes = self._og_regexes('video:secure_url') + regexes
466 return self._html_search_regex(regexes, html, name, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False):
        # Find a <meta> tag whose itemprop/name/property equals `name` and
        # return its content attribute. NOTE(review): the fallback
        # 'display_name = name' assignment and the raw-string opener of the
        # regex literal are missing from this view of the file.
        if display_name is None:
        return self._html_search_regex(
            (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
            [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal)
477 def _dc_search_uploader(self, html):
478 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # Detect the RTA adult-content label meta tag. NOTE(review): the
        # closing of this re.search(...) call and the return statements for
        # the labelled/unlabelled cases are missing from this view.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # NOTE(review): the guard for a missing rating and the RATING_TABLE
        # mapping (rating string -> age limit) are missing from this view.
        return RATING_TABLE.get(rating.lower(), None)
504 def _twitter_search_player(self, html):
505 return self._html_search_meta('twitter:player', html,
506 'twitter card player')
    def _sort_formats(self, formats):
        """Sort formats in place from worst to best quality.

        NOTE(review): large parts of this method are missing from this view
        (the empty-formats guard, the inner _formats_key() definition, and
        several branch openers); the statements below are fragments of those.
        """
        raise ExtractorError(u'No video formats found')

        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
            # Plain HTTP(S) downloads are preferred over other protocols.
            preference = 0 if proto in ['http', 'https'] else -0.1
        if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            audio_ext_preference = ORDER.index(f['ext'])
            audio_ext_preference = -1
        if self._downloader.params.get('prefer_free_formats'):
            ORDER = [u'flv', u'mp4', u'webm']
            ORDER = [u'webm', u'flv', u'mp4']
        ext_preference = ORDER.index(f['ext'])
        audio_ext_preference = 0

        # Sort-key components: missing numeric fields sort as -1 (worst).
        f.get('quality') if f.get('quality') is not None else -1,
        f.get('height') if f.get('height') is not None else -1,
        f.get('width') if f.get('width') is not None else -1,
        f.get('tbr') if f.get('tbr') is not None else -1,
        f.get('vbr') if f.get('vbr') is not None else -1,
        f.get('abr') if f.get('abr') is not None else -1,
        audio_ext_preference,
        f.get('filesize') if f.get('filesize') is not None else -1,

        formats.sort(key=_formats_key)
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # NOTE(review): the conditional expression around this
        # 'prefer_insecure' check is only partially visible in this view.
        if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative URLs ('//host/path') against a scheme.
        # NOTE(review): the None-url guard, the 'if scheme is None:' test and
        # the return statements are missing from this view of the file.
        if url.startswith('//'):
            scheme = self.http_scheme()
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
590 def _make_valid_url(cls):
591 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
594 def suitable(cls, url):
595 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Dispatch on the prefix of "KEY<prefix>:<query>": empty prefix means
        # one result, 'all' means _MAX_RESULTS, a number means that many.
        # NOTE(review): several branch openers ('if mobj is None:',
        # "if prefix == '':", the numeric 'n = int(prefix)' path) are missing
        # from this view; the statements below are fragments of them.
        mobj = re.match(self._make_valid_url(), query)
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
617 def _get_n_results(self, query, n):
618 """Get a specified number of results for a query"""
619 raise NotImplementedError("This method must be implemented by subclasses")
    def SEARCH_KEY(self):
        """Return this extractor's _SEARCH_KEY."""
        # NOTE(review): presumably exposed via @property — the decorator is
        # missing from this view of the file.
        return self._SEARCH_KEY