1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
18 compat_urllib_parse_urlparse,
# Unique sentinel object: lets _search_regex and friends distinguish
# "caller supplied no default" from an explicit default of None.
_NO_DEFAULT = object()
35 class InfoExtractor(object):
36 """Information Extractor class.
38 Information extractors are the classes that, given a URL, extract
39 information about the video (or videos) the URL refers to. This
40 information includes the real video URL, the video title, author and
41 others. The information is stored in a dictionary which is then
42 passed to the FileDownloader. The FileDownloader processes this
43 information possibly downloading the video to the file system, among
44 other possible outcomes.
46 The dictionaries must include the following fields:
49 title: Video title, unescaped.
51 Additionally, it must contain either a formats entry or a url one:
53 formats: A list of dictionaries for each format available, ordered
54 from worst to best quality.
57 * url Mandatory. The URL of the video file
58 * ext Will be calculated from url if missing
59 * format A human-readable description of the format
60 ("mp4 container with h264/opus").
61 Calculated from the format_id, width, height.
62 and format_note fields if missing.
63 * format_id A short description of the format
64 ("mp4_h264_opus" or "19").
65 Technically optional, but strongly recommended.
66 * format_note Additional info about the format
67 ("3D" or "DASH video")
68 * width Width of the video, if known
69 * height Height of the video, if known
70 * resolution Textual description of width and height
71 * tbr Average bitrate of audio and video in KBit/s
72 * abr Average audio bitrate in KBit/s
73 * acodec Name of the audio codec in use
74 * asr Audio sampling rate in Hertz
75 * vbr Average video bitrate in KBit/s
77 * vcodec Name of the video codec in use
78 * container Name of the container format
79 * filesize The number of bytes, if known in advance
80 * filesize_approx An estimate for the number of bytes
81 * player_url SWF Player URL (used for rtmpdump).
82 * protocol The protocol that will be used for the actual
84 "http", "https", "rtsp", "rtmp", "m3u8" or so.
85 * preference Order number of this format. If this field is
86 present and not None, the formats get sorted
87 by this field, regardless of all other values.
88 -1 for default (order by other properties),
89 -2 or smaller for less than default.
90 * language_preference Is this in the correct requested
92 10 if it's what the URL is about,
93 -1 for default (don't know),
94 -10 otherwise, other values reserved for now.
95 * quality Order number of the video quality of this
96 format, irrespective of the file format.
97 -1 for default (order by other properties),
98 -2 or smaller for less than default.
99 * source_preference Order number for this video source
100 (quality takes higher priority)
101 -1 for default (order by other properties),
102 -2 or smaller for less than default.
103 * http_referer HTTP Referer header value to set.
104 * http_method HTTP method to use for the download.
105 * http_headers A dictionary of additional HTTP headers
106 to add to the request.
107 * http_post_data Additional data to send with a POST
109 url: Final video URL.
110 ext: Video filename extension.
111 format: The video format, defaults to ext (used for --get-format)
112 player_url: SWF Player URL (used for rtmpdump).
114 The following fields are optional:
116 display_id An alternative identifier for the video, not necessarily
117 unique, but available before title. Typically, id is
118 something like "4234987", title "Dancing naked mole rats",
119 and display_id "dancing-naked-mole-rats"
120 thumbnails: A list of dictionaries, with the following entries:
122 * "width" (optional, int)
123 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
126 thumbnail: Full URL to a video thumbnail image.
127 description: One-line video description.
128 uploader: Full name of the video uploader.
129 timestamp: UNIX timestamp of the moment the video became available.
130 upload_date: Video upload date (YYYYMMDD).
131 If not explicitly set, calculated from timestamp.
132 uploader_id: Nickname or id of the video uploader.
133 location: Physical location where the video was filmed.
134 subtitles: The subtitle file contents as a dictionary in the format
135 {language: subtitles}.
136 duration: Length of the video in seconds, as an integer.
137 view_count: How many users have watched the video on the platform.
138 like_count: Number of positive ratings of the video
139 dislike_count: Number of negative ratings of the video
140 comment_count: Number of comments on the video
141 age_limit: Age restriction for the video, as an integer (years)
142 webpage_url: The url to the video webpage, if given to youtube-dl it
143 should allow to get the same result again. (It will be set
144 by YoutubeDL if it's missing)
145 categories: A list of categories that the video falls in, for example
147 is_live: True, False, or None (=unknown). Whether this video is a
148 live stream that goes on instead of a fixed-length video.
150 Unless mentioned otherwise, the fields should be Unicode strings.
152 Unless mentioned otherwise, None is equivalent to absence of information.
154 Subclasses of this one should re-define the _real_initialize() and
155 _real_extract() methods and define a _VALID_URL regexp.
156 Probably, they should also be added to the list of extractors.
158 Finally, the _WORKING attribute should be set to False for broken IEs
159 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # Register the downloader right away; it may be replaced later via
    # set_downloader().
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Deliberately look in cls.__dict__ rather than using
    # hasattr/getattr: a cached pattern on a superclass must not be
    # reused — each IE class compiles and caches its own _VALID_URL.
    try:
        matcher = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        matcher = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return matcher.match(url) is not None
def _match_id(cls, url):
    # Same per-class pattern caching trick as suitable().
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    # NOTE(review): this listing is truncated — the extraction of the
    # id group from `m` is not visible here.

"""Getter method for _WORKING."""

def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # NOTE(review): truncated — the run-only-once guard around this
    # call is not visible here.
    self._real_initialize()

def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # All screen output and network access of this extractor is routed
    # through this object.
    self._downloader = downloader
210 def _real_initialize(self):
211 """Real initialization process. Redefine in subclasses."""
214 def _real_extract(self, url):
215 """Real extraction process. Redefine in subclasses."""
220 """A string for getting the InfoExtractor with get_info_extractor"""
221 return cls.__name__[:-2]
225 return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    # note=None selects the default progress message; note=False is
    # silent. NOTE(review): truncated — the `if note is None:` branch
    # and the enclosing `try:` are not visible in this listing.
    self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen('%s' % (note,))
        self.to_screen('%s: %s' % (video_id, note))
    # Delegate the actual network I/O to the downloader.
    return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # On network failure: raise when fatal, otherwise warn and
        # continue (the branch structure is not fully visible here).
        errnote = 'Unable to download webpage'
        errmsg = '%s: %s' % (errnote, compat_str(err))
        raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        self._downloader.report_warning(errmsg)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]
    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    # NOTE(review): truncated — the non-fatal failure path (urlh is
    # False) is not visible in this listing.
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
    return (content, urlh)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
    # Decode the raw response body to text, trying to honour the
    # declared charset; also implements the dump_intermediate_pages and
    # write_pages debugging options.
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Charset from the Content-Type header, e.g. "text/html; charset=utf-8".
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    encoding = m.group(1)
    # Fallback: a <meta charset=...> declaration near the top of the
    # document (only the first 1024 bytes are scanned).
    m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                  webpage_bytes[:1024])
    encoding = m.group(1).decode('ascii')
    # UTF-16 little-endian BOM.
    elif webpage_bytes.startswith(b'\xff\xfe'):
    # NOTE(review): truncated — several guard branches (header-match
    # check, meta-match check, remaining BOM cases, the try: around
    # get_full_url) are not visible in this listing.
    if self._downloader.params.get('dump_intermediate_pages', False):
        url = url_or_request.get_full_url()
    except AttributeError:
        # url_or_request may be a plain string rather than a Request.
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        url = url_or_request.get_full_url()
    except AttributeError:
        basen = '%s_%s' % (video_id, url)
        # Keep dump filenames short by replacing the tail of long names
        # with an md5 digest.
        h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
        basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        absfilepath = os.path.abspath(filename)
        if len(absfilepath) > 259:
            filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)
    # Decode with the detected charset, replacing undecodable bytes;
    # the UTF-8 fallback's except clause is not visible in this listing.
    content = webpage_bytes.decode(encoding, 'replace')
    content = webpage_bytes.decode('utf-8', 'replace')
    # Detect Websense-filtered replacement pages and raise a clear,
    # expected error instead of failing obscurely later.
    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string """
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    # NOTE(review): truncated — the unpacking of (content, urlh) and the
    # failure propagation are not visible in this listing.

def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True):
    """Return the xml as an xml.etree.ElementTree.Element"""
    xml_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if xml_string is False:
    # NOTE(review): truncated — the early return for a failed download
    # and the `if transform_source:` guard are not visible here.
        # transform_source lets callers repair broken XML before parsing.
        xml_string = transform_source(xml_string)
    return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
    # NOTE(review): truncated — the end of this signature and parts of
    # the body (including the try: around json.loads) are not visible.
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if (not fatal) and json_string is False:
        # transform_source may repair the payload before parsing.
        json_string = transform_source(json_string)
    return json.loads(json_string)
    except ValueError as ve:
        errmsg = '%s: Failed to parse JSON ' % video_id
        raise ExtractorError(errmsg, cause=ve)
        self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with the IE name."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen('Confirming age')

def report_login(self):
    """Report attempt to log in."""
    self.to_screen('Logging in')
# Methods for following issue #608
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
    # NOTE(review): truncated — the 'url'/'ie_key' entries of this dict
    # and the final return are not visible in this listing.
    if video_id is not None:
        video_info['id'] = video_id

def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    video_info = {'_type': 'playlist',
    # NOTE(review): truncated — the 'entries' value, the guards before
    # these optional assignments, and the return are not visible here.
    video_info['id'] = playlist_id
    video_info['title'] = playlist_title
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    # NOTE(review): the docstring delimiters around the next four lines
    # are not visible in this truncated listing.
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    # A single pattern (string or precompiled) is searched directly;
    # the loop over a pattern list is not fully visible here.
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
        mobj = re.search(p, string, flags)
    # Highlight the field name in blue on capable (non-Windows) ttys.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    # return the first matching group
    return next(g for g in mobj.groups() if g is not None)
    return mobj.group(group)
    elif default is not _NO_DEFAULT:
    # fatal failures raise; non-fatal ones only warn and return None
    # (branch structure not fully visible in this listing).
    raise RegexNotFoundError('Unable to extract %s' % _name)
    self._downloader.report_warning('unable to extract %s; '
        'please report this issue on http://yt-dl.org/bug' % _name)

def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    # NOTE(review): the docstring quotes around the next line are not
    # visible in this listing.
    Like _search_regex, but strips HTML tags and unescapes entities.
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    return clean_html(res).strip()
def _get_login_info(self):
    # NOTE(review): the docstring delimiters are not visible in this
    # truncated listing.
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    if self._downloader is None:
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        # Credentials from the user's ~/.netrc entry for _NETRC_MACHINE;
        # parse failures only warn, they do not abort extraction.
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
    except (IOError, netrc.NetrcParseError) as err:
        self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
    return (username, password)

def _get_tfa_info(self):
    # NOTE(review): the docstring delimiters are not visible in this
    # truncated listing.
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    if self._downloader is None:
    downloader_params = self._downloader.params
    if downloader_params.get('twofactor', None) is not None:
        return downloader_params['twofactor']
503 # Helper functions for extracting OpenGraph info
def _og_regexes(prop):
    # Build regexes matching an OpenGraph <meta> tag for `prop`; the
    # content attribute may appear on either side of the property.
    content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
    property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
    template = r'<meta[^>]+?%s[^>]+?%s'
    # NOTE(review): truncated — the `return [` / `]` wrapping these two
    # entries is not visible in this listing.
    template % (property_re, content_re),
    template % (content_re, property_re),

def _og_search_property(self, prop, html, name=None, **kargs):
    # NOTE(review): truncated — the `if name is None:` guard around this
    # default assignment is not visible here.
    name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    return unescapeHTML(escaped)
522 def _og_search_thumbnail(self, html, **kargs):
523 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
525 def _og_search_description(self, html, **kargs):
526 return self._og_search_property('description', html, fatal=False, **kargs)
528 def _og_search_title(self, html, **kargs):
529 return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    regexes = self._og_regexes('video') + self._og_regexes('video:url')
    # NOTE(review): truncated — the `if secure:` guard around this line
    # is not visible; secure_url patterns appear to be prepended so they
    # are preferred when present.
    regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
537 def _og_search_url(self, html, **kargs):
538 return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    if display_name is None:
    # NOTE(review): truncated — the display_name default assignment and
    # the opening of the raw regex string literal are not visible here.
    return self._html_search_regex(
        (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
        [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
        html, display_name, fatal=fatal, group='content', **kwargs)
549 def _dc_search_uploader(self, html):
550 return self._html_search_meta('dc.creator', html, 'uploader')
def _rta_search(self, html):
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    # (?ix) = case-insensitive, verbose pattern.
    if re.search(r'(?ix)<meta\s+name="rating"\s+'
                 r' content="RTA-5042-1996-1400-1577-RTA"',
    # NOTE(review): truncated — the searched argument and the return
    # statements are not visible in this listing.

def _media_rating_search(self, html):
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    # NOTE(review): truncated — the RATING_TABLE mapping defined between
    # these lines is not visible; an unknown rating maps to None.
    return RATING_TABLE.get(rating.lower(), None)
576 def _twitter_search_player(self, html):
577 return self._html_search_meta('twitter:player', html,
578 'twitter card player')
def _sort_formats(self, formats):
    # Sort `formats` in place from worst to best using a composite key.
    # NOTE(review): this listing is truncated — the empty-list guard
    # around this raise and the inner `def _formats_key(f):` that the
    # following lines belong to are not visible here.
    raise ExtractorError('No video formats found')

    # TODO remove the following workaround
    from ..utils import determine_ext
    if not f.get('ext') and 'url' in f:
        f['ext'] = determine_ext(f['url'])

    preference = f.get('preference')
    if preference is None:
        # Derive a preference from the protocol: plain HTTP(S) is
        # slightly preferred over everything else.
        proto = f.get('protocol')
        proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
        preference = 0 if proto in ['http', 'https'] else -0.1
        if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

    if f.get('vcodec') == 'none':  # audio only
        if self._downloader.params.get('prefer_free_formats'):
            ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
        audio_ext_preference = ORDER.index(f['ext'])
        audio_ext_preference = -1
        if self._downloader.params.get('prefer_free_formats'):
            ORDER = ['flv', 'mp4', 'webm']
            ORDER = ['webm', 'flv', 'mp4']
        ext_preference = ORDER.index(f['ext'])
        audio_ext_preference = 0

    # Composite sort key entries: missing numeric fields default to -1
    # so any known value wins over an unknown one.
    f.get('language_preference') if f.get('language_preference') is not None else -1,
    f.get('quality') if f.get('quality') is not None else -1,
    f.get('height') if f.get('height') is not None else -1,
    f.get('width') if f.get('width') is not None else -1,
    f.get('tbr') if f.get('tbr') is not None else -1,
    f.get('vbr') if f.get('vbr') is not None else -1,
    f.get('abr') if f.get('abr') is not None else -1,
    audio_ext_preference,
    f.get('fps') if f.get('fps') is not None else -1,
    f.get('filesize') if f.get('filesize') is not None else -1,
    f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
    f.get('source_preference') if f.get('source_preference') is not None else -1,
    formats.sort(key=_formats_key)
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    # NOTE(review): truncated — only the prefer_insecure condition of
    # the conditional expression is visible in this listing.
    if self._downloader.params.get('prefer_insecure', False)

def _proto_relative_url(self, url, scheme=None):
    # Turn a protocol-relative URL ('//host/path') into an absolute one.
    if url.startswith('//'):
        scheme = self.http_scheme()
    # NOTE(review): truncated — the None/scheme handling and the return
    # statements are not visible in this listing.

def _sleep(self, timeout, video_id, msg_template=None):
    # Announce a wait to the user; %(video_id)s and %(timeout)s are
    # interpolated into msg_template.
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    # NOTE(review): truncated — the screen output and the actual sleep
    # call are not visible in this listing.
def _extract_f4m_formats(self, manifest_url, video_id):
    # Parse an Adobe HDS (f4m) manifest into format dicts.
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        # Prefer the bitrate for the id; fall back to the node index.
        format_id = 'f4m-%d' % (i if tbr is None else tbr)
        # NOTE(review): truncated — the formats-list setup and the
        # append() wrapping this dict literal are not visible here.
        'format_id': format_id,
        'width': int_or_none(media_el.attrib.get('width')),
        'height': int_or_none(media_el.attrib.get('height')),
    self._sort_formats(formats)
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    # Parse an HLS master playlist into format dicts.
    # NOTE(review): this listing is truncated — the meta-format entry
    # below is missing its surrounding list/dict construction.
    'format_id': 'm3u8-meta',
    'resolution': 'multiple',
    'format_note': 'Quality selection URL',
    # Resolve playlist-relative entries against the master URL.
    format_url = lambda u: (
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))
    m3u8_doc = self._download_webpage(
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    # Attribute-list parser for #EXT-X-STREAM-INF lines.
    r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            for m in kv_rex.finditer(line):
                if v.startswith('"'):
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
        if last_info is None:
            # URL line with no preceding STREAM-INF metadata.
            formats.append({'url': format_url(line)})
        # BANDWIDTH is bits/s in the playlist; tbr is stored in KBit/s.
        tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
        'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
        'url': format_url(line.strip()),
        'protocol': entry_protocol,
        'preference': preference,
        codecs = last_info.get('CODECS')
        # TODO: it looks like the video codec does not always come first
        va_codecs = codecs.split(',')
        f['vcodec'] = va_codecs[0].partition('.')[0]
        if len(va_codecs) > 1 and va_codecs[1]:
            f['acodec'] = va_codecs[1].partition('.')[0]
        resolution = last_info.get('RESOLUTION')
        width_str, height_str = resolution.split('x')
        f['width'] = int(width_str)
        f['height'] = int(height_str)
    self._sort_formats(formats)
753 def _live_title(self, name):
754 """ Generate the title for a live video """
755 now = datetime.datetime.now()
756 now_str = now.strftime("%Y-%m-%d %H:%M")
757 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    # Coerce v to an int via int_or_none; on parse failure either raise
    # (fatal) or warn.
    res = int_or_none(v, **kwargs)
    if 'get_attr' in kwargs:
        # NOTE(review): stray debug print to stdout — looks like a
        # leftover; consider removing.
        print(getattr(v, kwargs['get_attr']))
    # NOTE(review): truncated — the `res is None` guard, the else branch
    # and the final return are not visible in this listing.
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    raise ExtractorError(msg)
    self._downloader.report_warning(msg)

def _float(self, v, name, fatal=False, **kwargs):
    # Same contract as _int, but for floats.
    res = float_or_none(v, **kwargs)
    # NOTE(review): truncated — the guard, else branch and return are
    # not visible in this listing.
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    raise ExtractorError(msg)
    self._downloader.report_warning(msg)
782 class SearchInfoExtractor(InfoExtractor):
784 Base class for paged search queries extractors.
785 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
786 Instances should define _SEARCH_KEY and _MAX_RESULTS.
790 def _make_valid_url(cls):
791 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """A URL is suitable when it matches the generated search pattern."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
def _real_extract(self, query):
    # Parse "<key><N|all>:<terms>" queries and dispatch to
    # _get_n_results with the requested result count.
    mobj = re.match(self._make_valid_url(), query)
    # NOTE(review): truncated — the mobj-is-None guard around this raise
    # is not visible in this listing.
    raise ExtractorError('Invalid search query "%s"' % query)

    prefix = mobj.group('prefix')
    query = mobj.group('query')
    # An empty prefix means a single result.
    return self._get_n_results(query, 1)
    elif prefix == 'all':
        return self._get_n_results(query, self._MAX_RESULTS)
    # Numeric prefix: validate the requested count against the limits
    # (the parsing of `n` is not visible in this listing).
    raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
    elif n > self._MAX_RESULTS:
        self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
        n = self._MAX_RESULTS
    return self._get_n_results(query, n)
817 def _get_n_results(self, query, n):
818 """Get a specified number of results for a query"""
819 raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    """Expose the search key used in query URLs (mirrors _SEARCH_KEY)."""
    return self._SEARCH_KEY