from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import re
import socket
import sys
import time
import xml.etree.ElementTree

from ..compat import (
    compat_http_client,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_urlparse,
)
from ..utils import (
    clean_html,
    compiled_regex_type,
    ExtractorError,
    float_or_none,
    int_or_none,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)
32 _NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_referer     HTTP Referer header value to set.
                    * http_method      HTTP method to use for the download.
                    * http_headers     A dictionary of additional HTTP headers
                                 to add to the request.
                    * http_post_data   Additional data to send with a POST
                                 request.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "url"
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                          deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
161 def __init__(self, downloader=None):
162 """Constructor. Receives an optional downloader."""
164 self.set_downloader(downloader)
167 def suitable(cls, url):
168 """Receives a URL and returns True if suitable for this IE."""
170 # This does not use has/getattr intentionally - we want to know whether
171 # we have cached the regexp for *this* class, whereas getattr would also
172 # match the superclass
173 if '_VALID_URL_RE' not in cls.__dict__:
174 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
175 return cls._VALID_URL_RE.match(url) is not None
178 def _match_id(cls, url):
179 if '_VALID_URL_RE' not in cls.__dict__:
180 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
181 m = cls._VALID_URL_RE.match(url)
187 """Getter method for _WORKING."""
190 def initialize(self):
191 """Initializes an instance (authentication, etc)."""
193 self._real_initialize()
196 def extract(self, url):
197 """Extracts URL information and returns it in list of dicts."""
199 return self._real_extract(url)
201 def set_downloader(self, downloader):
202 """Sets the downloader for this IE."""
203 self._downloader = downloader
205 def _real_initialize(self):
206 """Real initialization process. Redefine in subclasses."""
209 def _real_extract(self, url):
210 """Real extraction process. Redefine in subclasses."""
215 """A string for getting the InfoExtractor with get_info_extractor"""
216 return cls.__name__[:-2]
220 return type(self).__name__[:-2]
222 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
223 """ Returns the response handle """
225 self.report_download_webpage(video_id)
226 elif note is not False:
228 self.to_screen('%s' % (note,))
230 self.to_screen('%s: %s' % (video_id, note))
232 return self._downloader.urlopen(url_or_request)
233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
237 errnote = 'Unable to download webpage'
238 errmsg = '%s: %s' % (errnote, compat_str(err))
240 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
242 self._downloader.report_warning(errmsg)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    # A False handle means the download failed non-fatally; propagate it.
    if urlh is False:
        assert not fatal
        return False
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
    return (content, urlh)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
    # Read the HTTP response body and decode it to text, sniffing the
    # charset from the Content-Type header, a <meta charset> tag, or a BOM;
    # optionally dump/save the raw page for debugging; detect a Websense
    # block page and raise.
    # NOTE(review): this excerpt is missing several control-flow lines
    # (if/else/try scaffolding); [elided: ...] markers below show where.
    # Recover the full body from upstream before relying on this block.
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Charset from a header like "text/html; charset=utf-8".
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    # [elided: if m:]
    encoding = m.group(1)
    # [elided: else branch — fall back to a <meta charset> tag in the first KiB]
    m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                  webpage_bytes[:1024])
    # [elided: if m:]
    encoding = m.group(1).decode('ascii')
    elif webpage_bytes.startswith(b'\xff\xfe'):
    # [elided: UTF-16 LE BOM branch body, and the final utf-8 default branch]
    if self._downloader.params.get('dump_intermediate_pages', False):
        # [elided: try:]
        url = url_or_request.get_full_url()
        except AttributeError:
        # [elided: fallback — url_or_request is already a plain URL string]
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        # [elided: try:]
        url = url_or_request.get_full_url()
        except AttributeError:
        # [elided: fallback assignment]
        basen = '%s_%s' % (video_id, url)
        # [elided: length guard] — over-long names get a hash suffix so the
        # resulting file name stays within filesystem limits.
        h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
        basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        absfilepath = os.path.abspath(filename)
        if len(absfilepath) > 259:
            filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)
    # [elided: try:] decode with the sniffed encoding ...
    content = webpage_bytes.decode(encoding, 'replace')
    # [elided: except LookupError:] ... unknown codec name -> utf-8 fallback
    content = webpage_bytes.decode('utf-8', 'replace')
    # Detect a Websense block page and fail with a helpful message.
    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        # [elided: if blocked_iframe:]
        msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    # [elided: return content]
319 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
320 """ Returns the data of the page as a string """
321 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
328 def _download_xml(self, url_or_request, video_id,
329 note='Downloading XML', errnote='Unable to download XML',
330 transform_source=None, fatal=True):
331 """Return the xml as an xml.etree.ElementTree.Element"""
332 xml_string = self._download_webpage(
333 url_or_request, video_id, note, errnote, fatal=fatal)
334 if xml_string is False:
337 xml_string = transform_source(xml_string)
338 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
340 def _download_json(self, url_or_request, video_id,
341 note='Downloading JSON metadata',
342 errnote='Unable to download JSON metadata',
343 transform_source=None,
345 json_string = self._download_webpage(
346 url_or_request, video_id, note, errnote, fatal=fatal)
347 if (not fatal) and json_string is False:
350 json_string = transform_source(json_string)
352 return json.loads(json_string)
353 except ValueError as ve:
354 errmsg = '%s: Failed to parse JSON ' % video_id
356 raise ExtractorError(errmsg, cause=ve)
358 self.report_warning(errmsg + str(ve))
360 def report_warning(self, msg, video_id=None):
361 idstr = '' if video_id is None else '%s: ' % video_id
362 self._downloader.report_warning(
363 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
365 def to_screen(self, msg):
366 """Print msg to screen, prefixing it with '[ie_name]'"""
367 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
369 def report_extraction(self, id_or_name):
370 """Report information extraction."""
371 self.to_screen('%s: Extracting information' % id_or_name)
373 def report_download_webpage(self, video_id):
374 """Report webpage download."""
375 self.to_screen('%s: Downloading webpage' % video_id)
377 def report_age_confirmation(self):
378 """Report attempt to confirm age."""
379 self.to_screen('Confirming age')
381 def report_login(self):
382 """Report attempt to log in."""
383 self.to_screen('Logging in')
# Methods for following #608
387 def url_result(url, ie=None, video_id=None):
388 """Returns a url that points to a page that should be processed"""
389 #TODO: ie should be the class used for getting the info
390 video_info = {'_type': 'url',
393 if video_id is not None:
394 video_info['id'] = video_id
397 def playlist_result(entries, playlist_id=None, playlist_title=None):
398 """Returns a playlist"""
399 video_info = {'_type': 'playlist',
402 video_info['id'] = playlist_id
404 video_info['title'] = playlist_title
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Try each pattern in turn; keep the first match.
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name on capable (non-Windows, tty) terminals.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
            'please report this issue on http://yt-dl.org/bug' % _name)
        return None
439 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
441 Like _search_regex, but strips HTML tags and unescapes entities.
443 res = self._search_regex(pattern, string, name, default, fatal, flags)
445 return clean_html(res).strip()
449 def _get_login_info(self):
451 Get the the login info as (username, password)
452 It will look in the netrc file using the _NETRC_MACHINE value
453 If there's no info available, return (None, None)
455 if self._downloader is None:
460 downloader_params = self._downloader.params
462 # Attempt to use provided username and password or .netrc data
463 if downloader_params.get('username', None) is not None:
464 username = downloader_params['username']
465 password = downloader_params['password']
466 elif downloader_params.get('usenetrc', False):
468 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
473 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
474 except (IOError, netrc.NetrcParseError) as err:
475 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
477 return (username, password)
479 def _get_tfa_info(self):
481 Get the two-factor authentication info
482 TODO - asking the user will be required for sms/phone verify
483 currently just uses the command line option
484 If there's no info available, return None
486 if self._downloader is None:
488 downloader_params = self._downloader.params
490 if downloader_params.get('twofactor', None) is not None:
491 return downloader_params['twofactor']
495 # Helper functions for extracting OpenGraph info
497 def _og_regexes(prop):
498 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
499 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
500 template = r'<meta[^>]+?%s[^>]+?%s'
502 template % (property_re, content_re),
503 template % (content_re, property_re),
506 def _og_search_property(self, prop, html, name=None, **kargs):
508 name = 'OpenGraph %s' % prop
509 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
512 return unescapeHTML(escaped)
514 def _og_search_thumbnail(self, html, **kargs):
515 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
517 def _og_search_description(self, html, **kargs):
518 return self._og_search_property('description', html, fatal=False, **kargs)
520 def _og_search_title(self, html, **kargs):
521 return self._og_search_property('title', html, **kargs)
523 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
524 regexes = self._og_regexes('video') + self._og_regexes('video:url')
526 regexes = self._og_regexes('video:secure_url') + regexes
527 return self._html_search_regex(regexes, html, name, **kargs)
529 def _og_search_url(self, html, **kargs):
530 return self._og_search_property('url', html, **kargs)
532 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
533 if display_name is None:
535 return self._html_search_regex(
537 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
538 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
539 html, display_name, fatal=fatal, **kwargs)
541 def _dc_search_uploader(self, html):
542 return self._html_search_meta('dc.creator', html, 'uploader')
544 def _rta_search(self, html):
545 # See http://www.rtalabel.org/index.php?content=howtofaq#single
546 if re.search(r'(?ix)<meta\s+name="rating"\s+'
547 r' content="RTA-5042-1996-1400-1577-RTA"',
def _media_rating_search(self, html):
    # Map a "rating" <meta> tag value to an age limit (int) or None.
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    # NOTE(review): the RATING_TABLE mapping (and, presumably, a guard for
    # `rating` being None — verify against upstream) is elided from this
    # excerpt; `rating.lower()` would raise AttributeError without it.
    # [elided: if not rating: return None; RATING_TABLE = { ... }]
    return RATING_TABLE.get(rating.lower(), None)
568 def _twitter_search_player(self, html):
569 return self._html_search_meta('twitter:player', html,
570 'twitter card player')
def _sort_formats(self, formats):
    # Sort `formats` in place from worst to best using a composite key.
    # NOTE(review): this excerpt drops several lines — the empty-list guard,
    # the `def _formats_key(f):` header, else/try scaffolding and parts of
    # the key tuple. [elided: ...] markers show the gaps; recover the full
    # body from upstream before relying on this block.
    # [elided: if not formats:]
    raise ExtractorError('No video formats found')
    # [elided: def _formats_key(f):]
    # TODO remove the following workaround
    from ..utils import determine_ext
    if not f.get('ext') and 'url' in f:
        f['ext'] = determine_ext(f['url'])
    # Explicit `preference` wins; otherwise derive one from the protocol.
    preference = f.get('preference')
    if preference is None:
        proto = f.get('protocol')
        # [elided: if proto is None:]
        proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
        # Plain HTTP(S) is slightly preferred over other protocols.
        preference = 0 if proto in ['http', 'https'] else -0.1
        if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
        # [elided: penalty assignment for unsupported HDS fragments]
    if f.get('vcodec') == 'none':  # audio only
        if self._downloader.params.get('prefer_free_formats'):
            ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
        # [elided: else:]
        ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
        # [elided: ext_preference default and try:]
        audio_ext_preference = ORDER.index(f['ext'])
        # [elided: except ValueError:]
        audio_ext_preference = -1
    # [elided: else:  (video formats)]
    if self._downloader.params.get('prefer_free_formats'):
        ORDER = ['flv', 'mp4', 'webm']
    # [elided: else:]
    ORDER = ['webm', 'flv', 'mp4']
    # [elided: try: / except ValueError: fallback]
    ext_preference = ORDER.index(f['ext'])
    audio_ext_preference = 0
    # Composite key; later entries break ties among earlier ones.
    # [elided: return ( preference, ... — some leading entries missing]
    f.get('quality') if f.get('quality') is not None else -1,
    f.get('height') if f.get('height') is not None else -1,
    f.get('width') if f.get('width') is not None else -1,
    f.get('tbr') if f.get('tbr') is not None else -1,
    f.get('vbr') if f.get('vbr') is not None else -1,
    f.get('abr') if f.get('abr') is not None else -1,
    audio_ext_preference,
    f.get('fps') if f.get('fps') is not None else -1,
    f.get('filesize') if f.get('filesize') is not None else -1,
    f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
    f.get('source_preference') if f.get('source_preference') is not None else -1,
    # [elided: trailing tiebreaker entries and closing paren]
    formats.sort(key=_formats_key)
631 def http_scheme(self):
632 """ Either "http:" or "https:", depending on the user's preferences """
635 if self._downloader.params.get('prefer_insecure', False)
638 def _proto_relative_url(self, url, scheme=None):
641 if url.startswith('//'):
643 scheme = self.http_scheme()
648 def _sleep(self, timeout, video_id, msg_template=None):
649 if msg_template is None:
650 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
651 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
def _extract_f4m_formats(self, manifest_url, video_id):
    # Download an Adobe HDS (f4m) manifest and build one format dict per
    # <media> node.
    # NOTE(review): lines are elided in this excerpt — the `formats = []`
    # initialiser, the url/ext/tbr entries of each format dict, the
    # append/close of the dict and the final return.
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')
    # [elided: formats = []]
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        # Prefer the bitrate as the id suffix; fall back to the node index.
        format_id = 'f4m-%d' % (i if tbr is None else tbr)
        # [elided: formats.append({]
        'format_id': format_id,
        # [elided: 'url', 'ext', 'tbr' entries]
        'width': int_or_none(media_el.attrib.get('width')),
        'height': int_or_none(media_el.attrib.get('height')),
        # [elided: })]
    self._sort_formats(formats)
    # [elided: return formats]
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    # Parse an HLS master playlist into youtube-dl format dicts.
    # NOTE(review): many lines are elided in this excerpt — the initial
    # formats list (with the m3u8-meta entry), the kv_rex assignment,
    # last_info bookkeeping and the final return. [elided: ...] markers
    # show the gaps; recover the full body from upstream.
    # [elided: formats = [{]
    'format_id': 'm3u8-meta',
    # [elided: 'url', 'ext', 'protocol', 'preference' entries of the meta format]
    'resolution': 'multiple',
    'format_note': 'Quality selection URL',
    # [elided: }]]
    # Resolve variant URIs relative to the playlist URL when needed.
    format_url = lambda u: (
        # [elided: u]
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))
    m3u8_doc = self._download_webpage(
        # [elided: m3u8_url, video_id,]
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    # [elided: last_info = None / kv_rex = re.compile(]
    r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            # Attribute list describing the *next* URI line.
            # [elided: last_info = {}]
            for m in kv_rex.finditer(line):
                # [elided: v = m.group('val')]
                if v.startswith('"'):
                    # [elided: strip the surrounding quotes]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            # [elided: continue — skip comments and blank lines]
        # [elided: else:]
        if last_info is None:
            # Variant URI without preceding EXT-X-STREAM-INF metadata.
            formats.append({'url': format_url(line)})
        # [elided: else:]
        tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
        # [elided: f = {]
        'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
        'url': format_url(line.strip()),
        # [elided: 'tbr', 'ext' entries]
        'protocol': entry_protocol,
        'preference': preference,
        # [elided: }]
        codecs = last_info.get('CODECS')
        # [elided: if codecs:]
        # TODO: looks like video codec is not always necessarily goes first
        va_codecs = codecs.split(',')
        # [elided: guard on va_codecs[0]]
        f['vcodec'] = va_codecs[0].partition('.')[0]
        if len(va_codecs) > 1 and va_codecs[1]:
            f['acodec'] = va_codecs[1].partition('.')[0]
        resolution = last_info.get('RESOLUTION')
        # [elided: if resolution:]
        width_str, height_str = resolution.split('x')
        f['width'] = int(width_str)
        f['height'] = int(height_str)
        # [elided: formats.append(f); last_info = None]
    self._sort_formats(formats)
    # [elided: return formats]
744 def _live_title(self, name):
745 """ Generate the title for a live video """
746 now = datetime.datetime.now()
747 now_str = now.strftime("%Y-%m-%d %H:%M")
748 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """Coerce v to int via int_or_none; raise or warn when unparsable."""
    res = int_or_none(v, **kwargs)
    # NOTE(review): this print looks like leftover debug output; kept to
    # preserve behavior, but it is a candidate for removal.
    if 'get_attr' in kwargs:
        print(getattr(v, kwargs['get_attr']))
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
def _float(self, v, name, fatal=False, **kwargs):
    """Coerce v to float via float_or_none; raise or warn when unparsable."""
    res = float_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
781 def _make_valid_url(cls):
782 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
785 def suitable(cls, url):
786 return re.match(cls._make_valid_url(), url) is not None
788 def _real_extract(self, query):
789 mobj = re.match(self._make_valid_url(), query)
791 raise ExtractorError('Invalid search query "%s"' % query)
793 prefix = mobj.group('prefix')
794 query = mobj.group('query')
796 return self._get_n_results(query, 1)
797 elif prefix == 'all':
798 return self._get_n_results(query, self._MAX_RESULTS)
802 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
803 elif n > self._MAX_RESULTS:
804 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
805 n = self._MAX_RESULTS
806 return self._get_n_results(query, n)
808 def _get_n_results(self, query, n):
809 """Get a specified number of results for a query"""
810 raise NotImplementedError("This method must be implemented by subclasses")
813 def SEARCH_KEY(self):
814 return self._SEARCH_KEY