1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
18 compat_urllib_parse_urlparse,
31 _NO_DEFAULT = object()
34 class InfoExtractor(object):
35 """Information Extractor class.
37 Information extractors are the classes that, given a URL, extract
38 information about the video (or videos) the URL refers to. This
39 information includes the real video URL, the video title, author and
40 others. The information is stored in a dictionary which is then
41 passed to the FileDownloader. The FileDownloader processes this
42 information possibly downloading the video to the file system, among
43 other possible outcomes.
45 The dictionaries must include the following fields:
48 title: Video title, unescaped.
50 Additionally, it must contain either a formats entry or a url one:
52 formats: A list of dictionaries for each format available, ordered
53 from worst to best quality.
56 * url Mandatory. The URL of the video file
57 * ext Will be calculated from url if missing
58 * format A human-readable description of the format
59 ("mp4 container with h264/opus").
60 Calculated from the format_id, width, height.
61 and format_note fields if missing.
62 * format_id A short description of the format
63 ("mp4_h264_opus" or "19").
64 Technically optional, but strongly recommended.
65 * format_note Additional info about the format
66 ("3D" or "DASH video")
67 * width Width of the video, if known
68 * height Height of the video, if known
69 * resolution Textual description of width and height
70 * tbr Average bitrate of audio and video in KBit/s
71 * abr Average audio bitrate in KBit/s
72 * acodec Name of the audio codec in use
73 * asr Audio sampling rate in Hertz
74 * vbr Average video bitrate in KBit/s
75 * vcodec Name of the video codec in use
76 * container Name of the container format
77 * filesize The number of bytes, if known in advance
78 * filesize_approx An estimate for the number of bytes
79 * player_url SWF Player URL (used for rtmpdump).
80 * protocol The protocol that will be used for the actual
82 "http", "https", "rtsp", "rtmp", "m3u8" or so.
83 * preference Order number of this format. If this field is
84 present and not None, the formats get sorted
85 by this field, regardless of all other values.
86 -1 for default (order by other properties),
87 -2 or smaller for less than default.
88 * quality Order number of the video quality of this
89 format, irrespective of the file format.
90 -1 for default (order by other properties),
91 -2 or smaller for less than default.
92 * http_referer HTTP Referer header value to set.
93 * http_method HTTP method to use for the download.
94 * http_headers A dictionary of additional HTTP headers
95 to add to the request.
96 * http_post_data Additional data to send with a POST
99 ext: Video filename extension.
100 format: The video format, defaults to ext (used for --get-format)
101 player_url: SWF Player URL (used for rtmpdump).
103 The following fields are optional:
105 display_id An alternative identifier for the video, not necessarily
106 unique, but available before title. Typically, id is
107 something like "4234987", title "Dancing naked mole rats",
108 and display_id "dancing-naked-mole-rats"
109 thumbnails: A list of dictionaries, with the following entries:
111 * "width" (optional, int)
112 * "height" (optional, int)
113 * "resolution" (optional, string "{width}x{height}",
115 thumbnail: Full URL to a video thumbnail image.
116 description: One-line video description.
117 uploader: Full name of the video uploader.
118 timestamp: UNIX timestamp of the moment the video became available.
119 upload_date: Video upload date (YYYYMMDD).
120 If not explicitly set, calculated from timestamp.
121 uploader_id: Nickname or id of the video uploader.
122 location: Physical location where the video was filmed.
123 subtitles: The subtitle file contents as a dictionary in the format
124 {language: subtitles}.
125 duration: Length of the video in seconds, as an integer.
126 view_count: How many users have watched the video on the platform.
127 like_count: Number of positive ratings of the video
128 dislike_count: Number of negative ratings of the video
129 comment_count: Number of comments on the video
130 age_limit: Age restriction for the video, as an integer (years)
131 webpage_url: The url to the video webpage, if given to youtube-dl it
132 should allow to get the same result again. (It will be set
133 by YoutubeDL if it's missing)
134 categories: A list of categories that the video falls in, for example
136 is_live: True, False, or None (=unknown). Whether this video is a
137 live stream that goes on instead of a fixed-length video.
139 Unless mentioned otherwise, the fields should be Unicode strings.
141 Subclasses of this one should re-define the _real_initialize() and
142 _real_extract() methods and define a _VALID_URL regexp.
143 Probably, they should also be added to the list of extractors.
145 Finally, the _WORKING attribute should be set to False for broken IEs
146 in order to warn the users and skip the tests.
153 def __init__(self, downloader=None):
154 """Constructor. Receives an optional downloader."""
156 self.set_downloader(downloader)
159 def suitable(cls, url):
160 """Receives a URL and returns True if suitable for this IE."""
162 # This does not use has/getattr intentionally - we want to know whether
163 # we have cached the regexp for *this* class, whereas getattr would also
164 # match the superclass
165 if '_VALID_URL_RE' not in cls.__dict__:
166 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
167 return cls._VALID_URL_RE.match(url) is not None
170 def _match_id(cls, url):
171 if '_VALID_URL_RE' not in cls.__dict__:
172 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
173 m = cls._VALID_URL_RE.match(url)
179 """Getter method for _WORKING."""
182 def initialize(self):
183 """Initializes an instance (authentication, etc)."""
185 self._real_initialize()
188 def extract(self, url):
189 """Extracts URL information and returns it in list of dicts."""
191 return self._real_extract(url)
193 def set_downloader(self, downloader):
194 """Sets the downloader for this IE."""
195 self._downloader = downloader
197 def _real_initialize(self):
198 """Real initialization process. Redefine in subclasses."""
201 def _real_extract(self, url):
202 """Real extraction process. Redefine in subclasses."""
207 """A string for getting the InfoExtractor with get_info_extractor"""
208 return cls.__name__[:-2]
212 return type(self).__name__[:-2]
214 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
215 """ Returns the response handle """
217 self.report_download_webpage(video_id)
218 elif note is not False:
220 self.to_screen('%s' % (note,))
222 self.to_screen('%s: %s' % (video_id, note))
224 return self._downloader.urlopen(url_or_request)
225 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
229 errnote = 'Unable to download webpage'
230 errmsg = '%s: %s' % (errnote, compat_str(err))
232 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
234 self._downloader.report_warning(errmsg)
237 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
238 """ Returns a tuple (page content as string, URL handle) """
240 # Strip hashes from the URL (#1038)
241 if isinstance(url_or_request, (compat_str, str)):
242 url_or_request = url_or_request.partition('#')[0]
244 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
248 content_type = urlh.headers.get('Content-Type', '')
249 webpage_bytes = urlh.read()
250 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
252 encoding = m.group(1)
254 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
255 webpage_bytes[:1024])
257 encoding = m.group(1).decode('ascii')
258 elif webpage_bytes.startswith(b'\xff\xfe'):
262 if self._downloader.params.get('dump_intermediate_pages', False):
264 url = url_or_request.get_full_url()
265 except AttributeError:
267 self.to_screen('Dumping request to ' + url)
268 dump = base64.b64encode(webpage_bytes).decode('ascii')
269 self._downloader.to_screen(dump)
270 if self._downloader.params.get('write_pages', False):
272 url = url_or_request.get_full_url()
273 except AttributeError:
275 basen = '%s_%s' % (video_id, url)
277 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
278 basen = basen[:240 - len(h)] + h
279 raw_filename = basen + '.dump'
280 filename = sanitize_filename(raw_filename, restricted=True)
281 self.to_screen('Saving request to ' + filename)
282 with open(filename, 'wb') as outf:
283 outf.write(webpage_bytes)
286 content = webpage_bytes.decode(encoding, 'replace')
288 content = webpage_bytes.decode('utf-8', 'replace')
290 if ('<title>Access to this site is blocked</title>' in content and
291 'Websense' in content[:512]):
292 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
293 blocked_iframe = self._html_search_regex(
294 r'<iframe src="([^"]+)"', content,
295 'Websense information URL', default=None)
297 msg += ' Visit %s for more details' % blocked_iframe
298 raise ExtractorError(msg, expected=True)
300 return (content, urlh)
302 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
303 """ Returns the data of the page as a string """
304 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
311 def _download_xml(self, url_or_request, video_id,
312 note='Downloading XML', errnote='Unable to download XML',
313 transform_source=None, fatal=True):
314 """Return the xml as an xml.etree.ElementTree.Element"""
315 xml_string = self._download_webpage(
316 url_or_request, video_id, note, errnote, fatal=fatal)
317 if xml_string is False:
320 xml_string = transform_source(xml_string)
321 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
323 def _download_json(self, url_or_request, video_id,
324 note='Downloading JSON metadata',
325 errnote='Unable to download JSON metadata',
326 transform_source=None,
328 json_string = self._download_webpage(
329 url_or_request, video_id, note, errnote, fatal=fatal)
330 if (not fatal) and json_string is False:
333 json_string = transform_source(json_string)
335 return json.loads(json_string)
336 except ValueError as ve:
337 errmsg = '%s: Failed to parse JSON ' % video_id
339 raise ExtractorError(errmsg, cause=ve)
341 self.report_warning(errmsg + str(ve))
343 def report_warning(self, msg, video_id=None):
344 idstr = '' if video_id is None else '%s: ' % video_id
345 self._downloader.report_warning(
346 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
348 def to_screen(self, msg):
349 """Print msg to screen, prefixing it with '[ie_name]'"""
350 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
352 def report_extraction(self, id_or_name):
353 """Report information extraction."""
354 self.to_screen('%s: Extracting information' % id_or_name)
356 def report_download_webpage(self, video_id):
357 """Report webpage download."""
358 self.to_screen('%s: Downloading webpage' % video_id)
360 def report_age_confirmation(self):
361 """Report attempt to confirm age."""
362 self.to_screen('Confirming age')
364 def report_login(self):
365 """Report attempt to log in."""
366 self.to_screen('Logging in')
368 # Methods for following #608
370 def url_result(url, ie=None, video_id=None):
371 """Returns a url that points to a page that should be processed"""
372 #TODO: ie should be the class used for getting the info
373 video_info = {'_type': 'url',
376 if video_id is not None:
377 video_info['id'] = video_id
380 def playlist_result(entries, playlist_id=None, playlist_title=None):
381 """Returns a playlist"""
382 video_info = {'_type': 'playlist',
385 video_info['id'] = playlist_id
387 video_info['title'] = playlist_title
390 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
392 Perform a regex search on the given string, using a single or a list of
393 patterns returning the first matching group.
394 In case of failure return a default value or raise a WARNING or a
395 RegexNotFoundError, depending on fatal, specifying the field name.
397 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
398 mobj = re.search(pattern, string, flags)
401 mobj = re.search(p, string, flags)
405 if os.name != 'nt' and sys.stderr.isatty():
406 _name = '\033[0;34m%s\033[0m' % name
411 # return the first matching group
412 return next(g for g in mobj.groups() if g is not None)
413 elif default is not _NO_DEFAULT:
416 raise RegexNotFoundError('Unable to extract %s' % _name)
418 self._downloader.report_warning('unable to extract %s; '
419 'please report this issue on http://yt-dl.org/bug' % _name)
422 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
424 Like _search_regex, but strips HTML tags and unescapes entities.
426 res = self._search_regex(pattern, string, name, default, fatal, flags)
428 return clean_html(res).strip()
432 def _get_login_info(self):
434 Get the login info as (username, password)
435 It will look in the netrc file using the _NETRC_MACHINE value
436 If there's no info available, return (None, None)
438 if self._downloader is None:
443 downloader_params = self._downloader.params
445 # Attempt to use provided username and password or .netrc data
446 if downloader_params.get('username', None) is not None:
447 username = downloader_params['username']
448 password = downloader_params['password']
449 elif downloader_params.get('usenetrc', False):
451 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
456 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
457 except (IOError, netrc.NetrcParseError) as err:
458 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
460 return (username, password)
462 def _get_tfa_info(self):
464 Get the two-factor authentication info
465 TODO - asking the user will be required for sms/phone verify
466 currently just uses the command line option
467 If there's no info available, return None
469 if self._downloader is None:
471 downloader_params = self._downloader.params
473 if downloader_params.get('twofactor', None) is not None:
474 return downloader_params['twofactor']
478 # Helper functions for extracting OpenGraph info
480 def _og_regexes(prop):
481 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
482 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
483 template = r'<meta[^>]+?%s[^>]+?%s'
485 template % (property_re, content_re),
486 template % (content_re, property_re),
489 def _og_search_property(self, prop, html, name=None, **kargs):
491 name = 'OpenGraph %s' % prop
492 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
495 return unescapeHTML(escaped)
497 def _og_search_thumbnail(self, html, **kargs):
498 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
500 def _og_search_description(self, html, **kargs):
501 return self._og_search_property('description', html, fatal=False, **kargs)
503 def _og_search_title(self, html, **kargs):
504 return self._og_search_property('title', html, **kargs)
506 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
507 regexes = self._og_regexes('video') + self._og_regexes('video:url')
509 regexes = self._og_regexes('video:secure_url') + regexes
510 return self._html_search_regex(regexes, html, name, **kargs)
512 def _og_search_url(self, html, **kargs):
513 return self._og_search_property('url', html, **kargs)
515 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
516 if display_name is None:
518 return self._html_search_regex(
520 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
521 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
522 html, display_name, fatal=fatal, **kwargs)
524 def _dc_search_uploader(self, html):
525 return self._html_search_meta('dc.creator', html, 'uploader')
527 def _rta_search(self, html):
528 # See http://www.rtalabel.org/index.php?content=howtofaq#single
529 if re.search(r'(?ix)<meta\s+name="rating"\s+'
530 r' content="RTA-5042-1996-1400-1577-RTA"',
535 def _media_rating_search(self, html):
536 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
537 rating = self._html_search_meta('rating', html)
549 return RATING_TABLE.get(rating.lower(), None)
551 def _twitter_search_player(self, html):
552 return self._html_search_meta('twitter:player', html,
553 'twitter card player')
555 def _sort_formats(self, formats):
557 raise ExtractorError('No video formats found')
560 # TODO remove the following workaround
561 from ..utils import determine_ext
562 if not f.get('ext') and 'url' in f:
563 f['ext'] = determine_ext(f['url'])
565 preference = f.get('preference')
566 if preference is None:
567 proto = f.get('protocol')
569 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
571 preference = 0 if proto in ['http', 'https'] else -0.1
572 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
575 if f.get('vcodec') == 'none': # audio only
576 if self._downloader.params.get('prefer_free_formats'):
577 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
579 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
582 audio_ext_preference = ORDER.index(f['ext'])
584 audio_ext_preference = -1
586 if self._downloader.params.get('prefer_free_formats'):
587 ORDER = ['flv', 'mp4', 'webm']
589 ORDER = ['webm', 'flv', 'mp4']
591 ext_preference = ORDER.index(f['ext'])
594 audio_ext_preference = 0
598 f.get('quality') if f.get('quality') is not None else -1,
599 f.get('height') if f.get('height') is not None else -1,
600 f.get('width') if f.get('width') is not None else -1,
602 f.get('tbr') if f.get('tbr') is not None else -1,
603 f.get('vbr') if f.get('vbr') is not None else -1,
604 f.get('abr') if f.get('abr') is not None else -1,
605 audio_ext_preference,
606 f.get('filesize') if f.get('filesize') is not None else -1,
607 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
610 formats.sort(key=_formats_key)
612 def http_scheme(self):
613 """ Either "http:" or "https:", depending on the user's preferences """
616 if self._downloader.params.get('prefer_insecure', False)
619 def _proto_relative_url(self, url, scheme=None):
622 if url.startswith('//'):
624 scheme = self.http_scheme()
629 def _sleep(self, timeout, video_id, msg_template=None):
630 if msg_template is None:
631 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
632 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
636 def _extract_f4m_formats(self, manifest_url, video_id):
637 manifest = self._download_xml(
638 manifest_url, video_id, 'Downloading f4m manifest',
639 'Unable to download f4m manifest')
642 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
643 for i, media_el in enumerate(media_nodes):
644 tbr = int_or_none(media_el.attrib.get('bitrate'))
645 format_id = 'f4m-%d' % (i if tbr is None else tbr)
647 'format_id': format_id,
651 'width': int_or_none(media_el.attrib.get('width')),
652 'height': int_or_none(media_el.attrib.get('height')),
654 self._sort_formats(formats)
658 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
659 entry_protocol='m3u8', preference=None):
662 'format_id': 'm3u8-meta',
667 'resolution': 'multiple',
668 'format_note': 'Quality selection URL',
671 format_url = lambda u: (
673 if re.match(r'^https?://', u)
674 else compat_urlparse.urljoin(m3u8_url, u))
676 m3u8_doc = self._download_webpage(m3u8_url, video_id)
679 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
680 for line in m3u8_doc.splitlines():
681 if line.startswith('#EXT-X-STREAM-INF:'):
683 for m in kv_rex.finditer(line):
685 if v.startswith('"'):
687 last_info[m.group('key')] = v
688 elif line.startswith('#') or not line.strip():
691 if last_info is None:
692 formats.append({'url': format_url(line)})
694 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
697 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
698 'url': format_url(line.strip()),
701 'protocol': entry_protocol,
702 'preference': preference,
704 codecs = last_info.get('CODECS')
706 # TODO: looks like video codec is not always necessarily goes first
707 va_codecs = codecs.split(',')
709 f['vcodec'] = va_codecs[0].partition('.')[0]
710 if len(va_codecs) > 1 and va_codecs[1]:
711 f['acodec'] = va_codecs[1].partition('.')[0]
712 resolution = last_info.get('RESOLUTION')
714 width_str, height_str = resolution.split('x')
715 f['width'] = int(width_str)
716 f['height'] = int(height_str)
719 self._sort_formats(formats)
722 def _live_title(self, name):
723 """ Generate the title for a live video """
724 now = datetime.datetime.now()
725 now_str = now.strftime("%Y-%m-%d %H:%M")
726 return name + ' ' + now_str
728 def _int(self, v, name, fatal=False, **kwargs):
729 res = int_or_none(v, **kwargs)
730 if 'get_attr' in kwargs:
731 print(getattr(v, kwargs['get_attr']))
733 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
735 raise ExtractorError(msg)
737 self._downloader.report_warning(msg)
740 def _float(self, v, name, fatal=False, **kwargs):
741 res = float_or_none(v, **kwargs)
743 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
745 raise ExtractorError(msg)
747 self._downloader.report_warning(msg)
751 class SearchInfoExtractor(InfoExtractor):
753 Base class for paged search queries extractors.
754 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
755 Instances should define _SEARCH_KEY and _MAX_RESULTS.
759 def _make_valid_url(cls):
760 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
763 def suitable(cls, url):
764 return re.match(cls._make_valid_url(), url) is not None
766 def _real_extract(self, query):
767 mobj = re.match(self._make_valid_url(), query)
769 raise ExtractorError('Invalid search query "%s"' % query)
771 prefix = mobj.group('prefix')
772 query = mobj.group('query')
774 return self._get_n_results(query, 1)
775 elif prefix == 'all':
776 return self._get_n_results(query, self._MAX_RESULTS)
780 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
781 elif n > self._MAX_RESULTS:
782 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
783 n = self._MAX_RESULTS
784 return self._get_n_results(query, n)
786 def _get_n_results(self, query, n):
787 """Get a specified number of results for a query"""
788 raise NotImplementedError("This method must be implemented by subclasses")
791 def SEARCH_KEY(self):
792 return self._SEARCH_KEY