1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
19 compat_urllib_parse_urlparse,
33 _NO_DEFAULT = object()
36 class InfoExtractor(object):
37 """Information Extractor class.
39 Information extractors are the classes that, given a URL, extract
40 information about the video (or videos) the URL refers to. This
41 information includes the real video URL, the video title, author and
42 others. The information is stored in a dictionary which is then
43 passed to the FileDownloader. The FileDownloader processes this
44 information possibly downloading the video to the file system, among
45 other possible outcomes.
    The type field determines the type of the result.
48 By far the most common value (and the default if _type is missing) is
49 "video", which indicates a single video.
51 For a video, the dictionaries must include the following fields:
54 title: Video title, unescaped.
56 Additionally, it must contain either a formats entry or a url one:
58 formats: A list of dictionaries for each format available, ordered
59 from worst to best quality.
62 * url Mandatory. The URL of the video file
63 * ext Will be calculated from url if missing
64 * format A human-readable description of the format
65 ("mp4 container with h264/opus").
66 Calculated from the format_id, width, height.
67 and format_note fields if missing.
68 * format_id A short description of the format
69 ("mp4_h264_opus" or "19").
70 Technically optional, but strongly recommended.
71 * format_note Additional info about the format
72 ("3D" or "DASH video")
73 * width Width of the video, if known
74 * height Height of the video, if known
75 * resolution Textual description of width and height
76 * tbr Average bitrate of audio and video in KBit/s
77 * abr Average audio bitrate in KBit/s
78 * acodec Name of the audio codec in use
79 * asr Audio sampling rate in Hertz
80 * vbr Average video bitrate in KBit/s
82 * vcodec Name of the video codec in use
83 * container Name of the container format
84 * filesize The number of bytes, if known in advance
85 * filesize_approx An estimate for the number of bytes
86 * player_url SWF Player URL (used for rtmpdump).
87 * protocol The protocol that will be used for the actual
89 "http", "https", "rtsp", "rtmp", "m3u8" or so.
90 * preference Order number of this format. If this field is
91 present and not None, the formats get sorted
92 by this field, regardless of all other values.
93 -1 for default (order by other properties),
94 -2 or smaller for less than default.
95 * language_preference Is this in the correct requested
97 10 if it's what the URL is about,
98 -1 for default (don't know),
99 -10 otherwise, other values reserved for now.
100 * quality Order number of the video quality of this
101 format, irrespective of the file format.
102 -1 for default (order by other properties),
103 -2 or smaller for less than default.
104 * source_preference Order number for this video source
105 (quality takes higher priority)
106 -1 for default (order by other properties),
107 -2 or smaller for less than default.
108 * http_referer HTTP Referer header value to set.
109 * http_method HTTP method to use for the download.
110 * http_headers A dictionary of additional HTTP headers
111 to add to the request.
112 * http_post_data Additional data to send with a POST
114 url: Final video URL.
115 ext: Video filename extension.
116 format: The video format, defaults to ext (used for --get-format)
117 player_url: SWF Player URL (used for rtmpdump).
119 The following fields are optional:
121 display_id An alternative identifier for the video, not necessarily
122 unique, but available before title. Typically, id is
123 something like "4234987", title "Dancing naked mole rats",
124 and display_id "dancing-naked-mole-rats"
125 thumbnails: A list of dictionaries, with the following entries:
127 * "width" (optional, int)
128 * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
131 thumbnail: Full URL to a video thumbnail image.
132 description: One-line video description.
133 uploader: Full name of the video uploader.
134 timestamp: UNIX timestamp of the moment the video became available.
135 upload_date: Video upload date (YYYYMMDD).
136 If not explicitly set, calculated from timestamp.
137 uploader_id: Nickname or id of the video uploader.
138 location: Physical location where the video was filmed.
139 subtitles: The subtitle file contents as a dictionary in the format
140 {language: subtitles}.
141 duration: Length of the video in seconds, as an integer.
142 view_count: How many users have watched the video on the platform.
143 like_count: Number of positive ratings of the video
144 dislike_count: Number of negative ratings of the video
145 comment_count: Number of comments on the video
146 age_limit: Age restriction for the video, as an integer (years)
147 webpage_url: The url to the video webpage, if given to youtube-dl it
148 should allow to get the same result again. (It will be set
149 by YoutubeDL if it's missing)
150 categories: A list of categories that the video falls in, for example
152 is_live: True, False, or None (=unknown). Whether this video is a
153 live stream that goes on instead of a fixed-length video.
155 Unless mentioned otherwise, the fields should be Unicode strings.
157 Unless mentioned otherwise, None is equivalent to absence of information.
160 _type "playlist" indicates multiple videos.
161 There must be a key "entries", which is a list, an iterable, or a PagedList
162 object, each element of which is a valid dictionary by this specification.
164 Additionally, playlists can have "title" and "id" attributes with the same
165 semantics as videos (see above).
168 _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
170 It must have an entries key like a playlist and contain all the keys
171 required for a video at the same time.
174 _type "url" indicates that the video must be extracted from another
175 location, possibly by a different extractor. Its only required key is:
176 "url" - the next URL to extract.
178 Additionally, it may have properties believed to be identical to the
179 resolved entity, for example "title" if the title of the referred video is
183 _type "url_transparent" entities have the same specification as "url", but
184 indicate that the given additional information is more precise than the one
185 associated with the resolved URL.
186 This is useful when a site employs a video service that hosts the video and
187 its technical metadata, but that video service does not embed a useful
188 title, description etc.
191 Subclasses of this one should re-define the _real_initialize() and
192 _real_extract() methods and define a _VALID_URL regexp.
193 Probably, they should also be added to the list of extractors.
195 Finally, the _WORKING attribute should be set to False for broken IEs
196 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL-style
        object used for options, output and network access)."""
        # NOTE(review): additional initialization lines are elided in this view.
        self.set_downloader(downloader)
209 def suitable(cls, url):
210 """Receives a URL and returns True if suitable for this IE."""
212 # This does not use has/getattr intentionally - we want to know whether
213 # we have cached the regexp for *this* class, whereas getattr would also
214 # match the superclass
215 if '_VALID_URL_RE' not in cls.__dict__:
216 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
217 return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        # Match *url* against _VALID_URL (compiled pattern is cached on this
        # class, mirroring suitable()).
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        # NOTE(review): the use/return of `m` is elided from this view.
229 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Delegates to the subclass's _real_initialize(); the guard lines
        # that make this a run-once operation are elided from this view.
            self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates to the subclass's _real_extract(); the initialize() call
        # that normally precedes it is elided from this view.
        return self._real_extract(url)
243 def set_downloader(self, downloader):
244 """Sets the downloader for this IE."""
245 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.
        The base implementation is a no-op."""
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.
        Should return the info dict(s) for *url* (see the class docstring)."""
257 """A string for getting the InfoExtractor with get_info_extractor"""
258 return cls.__name__[:-2]
262 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # Progress reporting: note=False silences output, note=None uses the
        # default message. (Several branch lines are elided in this view.)
            self.report_download_webpage(video_id)
        elif note is not False:
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
                # fatal path: re-raise with the original traceback attached
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
                self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): the non-fatal False short-circuit is elided here.
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
        return (content, urlh)
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
        """Read *urlh* and return the decoded page content as a string.

        The charset is taken from the Content-Type header, a <meta charset>
        tag, or a BOM; the page may be dumped/saved for debugging depending
        on downloader options. (Some guard lines are elided in this view.)
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        # Charset from the Content-Type header takes precedence
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Fall back to a <meta ... charset=...> tag in the first 1 KiB
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
        # --dump-pages: emit the page to the screen, base64-encoded
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # --write-pages: save the raw page next to the output
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
                # Over-long name: truncate and append an md5 of the full name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)
            content = webpage_bytes.decode(encoding, 'replace')
            # Unknown codec name: fall back to utf-8 rather than failing
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect a Websense corporate filter block page and fail loudly
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        # Thin wrapper over _download_webpage_handle that drops the handle.
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): the unpacking/return lines are elided in this view.
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
        # transform_source, when given, preprocesses the raw XML text before
        # parsing. (Intermediate guard lines are elided in this view.)
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
        """Download a JSON document and return the decoded object."""
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if (not fatal) and json_string is False:
        # NOTE(review): early-return and the transform_source guard are
        # elided from this view.
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            # `fatal` controls raise vs. warning (branch lines elided here)
                raise ExtractorError(errmsg, cause=ve)
                self.report_warning(errmsg + str(ve))
404 def report_warning(self, msg, video_id=None):
405 idstr = '' if video_id is None else '%s: ' % video_id
406 self._downloader.report_warning(
407 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
409 def to_screen(self, msg):
410 """Print msg to screen, prefixing it with '[ie_name]'"""
411 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
413 def report_extraction(self, id_or_name):
414 """Report information extraction."""
415 self.to_screen('%s: Extracting information' % id_or_name)
417 def report_download_webpage(self, video_id):
418 """Report webpage download."""
419 self.to_screen('%s: Downloading webpage' % video_id)
421 def report_age_confirmation(self):
422 """Report attempt to confirm age."""
423 self.to_screen('Confirming age')
425 def report_login(self):
426 """Report attempt to log in."""
427 self.to_screen('Logging in')
429 # Methods for following #608
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): remaining dict entries and the return are elided here.
        if video_id is not None:
            video_info['id'] = video_id
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): the 'entries' item, the id/title guards and the
        # return are elided from this view.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        # For a list of patterns, each is tried in turn
                mobj = re.search(p, string, flags)

        # Color the field name blue on capable (non-Windows) terminals
        if os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name

                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
                return mobj.group(group)
        elif default is not _NO_DEFAULT:
        # NOTE(review): several branch lines are elided from this view.
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s; '
                'please report this issue on http://yt-dl.org/bug' % _name)
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        # NOTE(review): the falsy-result guard around this cleanup is elided.
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
        # NOTE(review): early return, local defaults and the try guard around
        # the netrc lookup are elided from this view.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a broken .netrc only produces a warning
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
    def _get_tfa_info(self):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
        # NOTE(review): early/final `return None` lines are elided here.
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']
545 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        # Build regexes matching an OpenGraph og:<prop> <meta> tag with the
        # property and content attributes in either order.
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        # NOTE(review): the surrounding `return [` ... `]` is elided here.
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _og_search_property(self, prop, html, name=None, **kargs):
        # Extract og:<prop> content from *html* and unescape HTML entities.
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        # NOTE(review): a None guard before unescaping is elided here.
        return unescapeHTML(escaped)
564 def _og_search_thumbnail(self, html, **kargs):
565 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
567 def _og_search_description(self, html, **kargs):
568 return self._og_search_property('description', html, fatal=False, **kargs)
570 def _og_search_title(self, html, **kargs):
571 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # og:video / og:video:url lookups; secure variants are prepended when
        # `secure` is set (that guard line is elided from this view).
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
579 def _og_search_url(self, html, **kargs):
580 return self._og_search_property('url', html, **kargs)
582 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
583 if display_name is None:
585 return self._html_search_regex(
587 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
588 [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
589 html, display_name, fatal=fatal, group='content', **kwargs)
591 def _dc_search_uploader(self, html):
592 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # Detect the RTA adult-content rating label in the page's meta tags.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
        # NOTE(review): the closing argument and return values are elided.
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # NOTE(review): the RATING_TABLE mapping definition and the
        # missing-rating guard are elided from this view.
        return RATING_TABLE.get(rating.lower(), None)
618 def _twitter_search_player(self, html):
619 return self._html_search_meta('twitter:player', html,
620 'twitter card player')
    def _sort_formats(self, formats):
        """Sort *formats* in-place from worst to best quality."""
        # NOTE(review): the emptiness guard and the nested key-function def
        # line are elided from this view; the indented lines below belong to
        # that per-format key function.
            raise ExtractorError('No video formats found')
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain http(s) downloads are preferred over other protocols
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                    ORDER = ['webm', 'flv', 'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

            # Sort-key tuple entries: absent numeric fields default to -1
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
        formats.sort(key=_formats_key)
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # NOTE(review): the surrounding return expression is elided here.
            if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        # Complete protocol-relative (//host/...) URLs with a scheme; the
        # default comes from http_scheme(). (Guard/return lines elided here.)
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        # Inform the user and wait `timeout` seconds (the screen write and
        # sleep call are elided from this view). msg_template may reference
        # %(video_id)s and %(timeout)s.
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id):
        """Parse an Adobe HDS (f4m) manifest into a sorted formats list."""
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest')
        # NOTE(review): formats-list setup and several dict entries are
        # elided from this view.
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            # Use the bitrate as the id suffix; fall back to the index
            format_id = 'f4m-%d' % (i if tbr is None else tbr)
                'format_id': format_id,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
        self._sort_formats(formats)
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        """Parse an HLS (m3u8) master playlist into a sorted formats list.
        (Several lines of this method are elided from this view.)"""
        # Meta format pointing at the playlist itself (quality selection)
            'format_id': 'm3u8-meta',
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',

        # Resolve playlist-relative entry URLs against the manifest URL
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            note='Downloading m3u8 information',
            errnote='Failed to download m3u8 information')
        # key=value attribute parser for #EXT-X-STREAM-INF lines
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                for m in kv_rex.finditer(line):
                    # Strip surrounding quotes from quoted attribute values
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # A URL line without preceding STREAM-INF: bare entry
                if last_info is None:
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                codecs = last_info.get('CODECS')
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
        self._sort_formats(formats)
795 def _live_title(self, name):
796 """ Generate the title for a live video """
797 now = datetime.datetime.now()
798 now_str = now.strftime("%Y-%m-%d %H:%M")
799 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        # Coerce *v* to int via int_or_none; on failure either raise or warn
        # depending on `fatal`. (Guard lines are elided from this view.)
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): stray debug print — writes straight to stdout on
            # every call with get_attr; consider removing.
            print(getattr(v, kwargs['get_attr']))
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Coerce *v* to float via float_or_none; failure handling mirrors
        # _int. (Guard lines are elided from this view.)
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
823 def _set_cookie(self, domain, name, value, expire_time=None):
824 cookie = compat_cookiejar.Cookie(
825 0, name, value, None, None, domain, None,
826 None, '/', True, False, expire_time, '', None, None, None)
827 self._downloader.cookiejar.set_cookie(cookie)
830 class SearchInfoExtractor(InfoExtractor):
832 Base class for paged search queries extractors.
833 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
834 Instances should define _SEARCH_KEY and _MAX_RESULTS.
838 def _make_valid_url(cls):
839 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
842 def suitable(cls, url):
843 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse "<key><count|all|>:<terms>" and dispatch to _get_n_results.
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): several guard/branch lines are elided in this view.
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix means a single result; 'all' caps at _MAX_RESULTS.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Requested more than supported: warn and clamp
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
865 def _get_n_results(self, query, n):
866 """Get a specified number of results for a query"""
867 raise NotImplementedError("This method must be implemented by subclasses")
870 def SEARCH_KEY(self):
871 return self._SEARCH_KEY