1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
20 compat_urllib_parse_urlparse,
36 _NO_DEFAULT = object()
39 class InfoExtractor(object):
40 """Information Extractor class.
42 Information extractors are the classes that, given a URL, extract
43 information about the video (or videos) the URL refers to. This
44 information includes the real video URL, the video title, author and
45 others. The information is stored in a dictionary which is then
46 passed to the YoutubeDL. The YoutubeDL processes this
47 information possibly downloading the video to the file system, among
48 other possible outcomes.
50 The type field determines the type of the result.
51 By far the most common value (and the default if _type is missing) is
52 "video", which indicates a single video.
54 For a video, the dictionaries must include the following fields:
57 title: Video title, unescaped.
59 Additionally, it must contain either a formats entry or a url one:
61 formats: A list of dictionaries for each format available, ordered
62 from worst to best quality.
65 * url Mandatory. The URL of the video file
66 * ext Will be calculated from url if missing
67 * format A human-readable description of the format
68 ("mp4 container with h264/opus").
69 Calculated from the format_id, width, height.
70 and format_note fields if missing.
71 * format_id A short description of the format
72 ("mp4_h264_opus" or "19").
73 Technically optional, but strongly recommended.
74 * format_note Additional info about the format
75 ("3D" or "DASH video")
76 * width Width of the video, if known
77 * height Height of the video, if known
78 * resolution Textual description of width and height
79 * tbr Average bitrate of audio and video in KBit/s
80 * abr Average audio bitrate in KBit/s
81 * acodec Name of the audio codec in use
82 * asr Audio sampling rate in Hertz
83 * vbr Average video bitrate in KBit/s
85 * vcodec Name of the video codec in use
86 * container Name of the container format
87 * filesize The number of bytes, if known in advance
88 * filesize_approx An estimate for the number of bytes
89 * player_url SWF Player URL (used for rtmpdump).
90 * protocol The protocol that will be used for the actual
92 "http", "https", "rtsp", "rtmp", "rtmpe",
93 "m3u8", or "m3u8_native".
94 * preference Order number of this format. If this field is
95 present and not None, the formats get sorted
96 by this field, regardless of all other values.
97 -1 for default (order by other properties),
98 -2 or smaller for less than default.
99 < -1000 to hide the format (if there is
100 another one which is strictly better)
101 * language_preference Is this in the correct requested
103 10 if it's what the URL is about,
104 -1 for default (don't know),
105 -10 otherwise, other values reserved for now.
106 * quality Order number of the video quality of this
107 format, irrespective of the file format.
108 -1 for default (order by other properties),
109 -2 or smaller for less than default.
110 * source_preference Order number for this video source
111 (quality takes higher priority)
112 -1 for default (order by other properties),
113 -2 or smaller for less than default.
114 * http_method HTTP method to use for the download.
115 * http_headers A dictionary of additional HTTP headers
116 to add to the request.
117 * http_post_data Additional data to send with a POST
119 * stretched_ratio If given and not 1, indicates that the
120 video's pixels are not square.
121 width : height ratio as float.
122 * no_resume The server does not support resuming the
123 (HTTP or RTMP) download. Boolean.
125 url: Final video URL.
126 ext: Video filename extension.
127 format: The video format, defaults to ext (used for --get-format)
128 player_url: SWF Player URL (used for rtmpdump).
130 The following fields are optional:
132 alt_title: A secondary title of the video.
133 display_id An alternative identifier for the video, not necessarily
134 unique, but available before title. Typically, id is
135 something like "4234987", title "Dancing naked mole rats",
136 and display_id "dancing-naked-mole-rats"
137 thumbnails: A list of dictionaries, with the following entries:
138 * "id" (optional, string) - Thumbnail format ID
140 * "preference" (optional, int) - quality of the image
141 * "width" (optional, int)
142 * "height" (optional, int)
143 * "resolution" (optional, string "{width}x{height}",
145 thumbnail: Full URL to a video thumbnail image.
146 description: Full video description.
147 uploader: Full name of the video uploader.
148 creator: The main artist who created the video.
149 timestamp: UNIX timestamp of the moment the video became available.
150 upload_date: Video upload date (YYYYMMDD).
151 If not explicitly set, calculated from timestamp.
152 uploader_id: Nickname or id of the video uploader.
153 location: Physical location where the video was filmed.
154 subtitles: The subtitle file contents as a dictionary in the format
155 {language: subtitles}.
156 duration: Length of the video in seconds, as an integer.
157 view_count: How many users have watched the video on the platform.
158 like_count: Number of positive ratings of the video
159 dislike_count: Number of negative ratings of the video
160 average_rating: Average rating given by users, the scale used depends on the webpage
161 comment_count: Number of comments on the video
162 comments: A list of comments, each with one or more of the following
163 properties (all but one of text or html optional):
164 * "author" - human-readable name of the comment author
165 * "author_id" - user ID of the comment author
167 * "html" - Comment as HTML
168 * "text" - Plain text of the comment
169 * "timestamp" - UNIX timestamp of comment
170 * "parent" - ID of the comment this one is replying to.
171 Set to "root" to indicate that this is a
172 comment to the original video.
173 age_limit: Age restriction for the video, as an integer (years)
174 webpage_url: The url to the video webpage, if given to youtube-dl it
175 should allow to get the same result again. (It will be set
176 by YoutubeDL if it's missing)
177 categories: A list of categories that the video falls in, for example
179 is_live: True, False, or None (=unknown). Whether this video is a
180 live stream that goes on instead of a fixed-length video.
182 Unless mentioned otherwise, the fields should be Unicode strings.
184 Unless mentioned otherwise, None is equivalent to absence of information.
187 _type "playlist" indicates multiple videos.
188 There must be a key "entries", which is a list, an iterable, or a PagedList
189 object, each element of which is a valid dictionary by this specification.
191 Additionally, playlists can have "title" and "id" attributes with the same
192 semantics as videos (see above).
195 _type "multi_video" indicates that there are multiple videos that
196 form a single show, for example multiple acts of an opera or TV episode.
197 It must have an entries key like a playlist and contain all the keys
198 required for a video at the same time.
201 _type "url" indicates that the video must be extracted from another
202 location, possibly by a different extractor. Its only required key is:
203 "url" - the next URL to extract.
204 The key "ie_key" can be set to the class name (minus the trailing "IE",
205 e.g. "Youtube") if the extractor class is known in advance.
206 Additionally, the dictionary may have any properties of the resolved entity
207 known in advance, for example "title" if the title of the referred video is
211 _type "url_transparent" entities have the same specification as "url", but
212 indicate that the given additional information is more precise than the one
213 associated with the resolved URL.
214 This is useful when a site employs a video service that hosts the video and
215 its technical metadata, but that video service does not embed a useful
216 title, description etc.
219 Subclasses of this one should re-define the _real_initialize() and
220 _real_extract() methods and define a _VALID_URL regexp.
221 Probably, they should also be added to the list of extractors.
223 Finally, the _WORKING attribute should be set to False for broken IEs
224 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Hook up the downloader immediately; it may be None and replaced
        # later via set_downloader().
        self.set_downloader(downloader)
237 def suitable(cls, url):
238 """Receives a URL and returns True if suitable for this IE."""
240 # This does not use has/getattr intentionally - we want to know whether
241 # we have cached the regexp for *this* class, whereas getattr would also
242 # match the superclass
243 if '_VALID_URL_RE' not in cls.__dict__:
244 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
245 return cls._VALID_URL_RE.match(url) is not None
248 def _match_id(cls, url):
249 if '_VALID_URL_RE' not in cls.__dict__:
250 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
251 m = cls._VALID_URL_RE.match(url)
257 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Delegates to the subclass hook.  NOTE(review): the usual
        # run-only-once guard (a readiness flag) is not visible here —
        # confirm whether repeated calls should re-run _real_initialize().
        self._real_initialize()
266 def extract(self, url):
267 """Extracts URL information and returns it in list of dicts."""
270 return self._real_extract(url)
271 except ExtractorError:
273 except compat_http_client.IncompleteRead as e:
274 raise ExtractorError('A network error has occured.', cause=e, expected=True)
275 except (KeyError, StopIteration) as e:
276 raise ExtractorError('An extractor error has occured.', cause=e)
278 def set_downloader(self, downloader):
279 """Sets the downloader for this IE."""
280 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class; every concrete IE
        # must override this with its actual extraction logic.
292 """A string for getting the InfoExtractor with get_info_extractor"""
293 return cls.__name__[:-2]
297 return type(self).__name__[:-2]
299 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
300 """ Returns the response handle """
302 self.report_download_webpage(video_id)
303 elif note is not False:
305 self.to_screen('%s' % (note,))
307 self.to_screen('%s: %s' % (video_id, note))
309 return self._downloader.urlopen(url_or_request)
310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
314 errnote = 'Unable to download webpage'
315 errmsg = '%s: %s' % (errnote, compat_str(err))
317 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
319 self._downloader.report_warning(errmsg)
322 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
323 """ Returns a tuple (page content as string, URL handle) """
324 # Strip hashes from the URL (#1038)
325 if isinstance(url_or_request, (compat_str, str)):
326 url_or_request = url_or_request.partition('#')[0]
328 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
332 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
333 return (content, urlh)
335 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
336 content_type = urlh.headers.get('Content-Type', '')
337 webpage_bytes = urlh.read()
338 if prefix is not None:
339 webpage_bytes = prefix + webpage_bytes
340 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
342 encoding = m.group(1)
344 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
345 webpage_bytes[:1024])
347 encoding = m.group(1).decode('ascii')
348 elif webpage_bytes.startswith(b'\xff\xfe'):
352 if self._downloader.params.get('dump_intermediate_pages', False):
354 url = url_or_request.get_full_url()
355 except AttributeError:
357 self.to_screen('Dumping request to ' + url)
358 dump = base64.b64encode(webpage_bytes).decode('ascii')
359 self._downloader.to_screen(dump)
360 if self._downloader.params.get('write_pages', False):
362 url = url_or_request.get_full_url()
363 except AttributeError:
365 basen = '%s_%s' % (video_id, url)
367 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
368 basen = basen[:240 - len(h)] + h
369 raw_filename = basen + '.dump'
370 filename = sanitize_filename(raw_filename, restricted=True)
371 self.to_screen('Saving request to ' + filename)
372 # Working around MAX_PATH limitation on Windows (see
373 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
375 absfilepath = os.path.abspath(filename)
376 if len(absfilepath) > 259:
377 filename = '\\\\?\\' + absfilepath
378 with open(filename, 'wb') as outf:
379 outf.write(webpage_bytes)
382 content = webpage_bytes.decode(encoding, 'replace')
384 content = webpage_bytes.decode('utf-8', 'replace')
386 if ('<title>Access to this site is blocked</title>' in content and
387 'Websense' in content[:512]):
388 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
389 blocked_iframe = self._html_search_regex(
390 r'<iframe src="([^"]+)"', content,
391 'Websense information URL', default=None)
393 msg += ' Visit %s for more details' % blocked_iframe
394 raise ExtractorError(msg, expected=True)
398 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
399 """ Returns the data of the page as a string """
402 while success is False:
404 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
406 except compat_http_client.IncompleteRead as e:
408 if try_count >= tries:
410 self._sleep(timeout, video_id)
417 def _download_xml(self, url_or_request, video_id,
418 note='Downloading XML', errnote='Unable to download XML',
419 transform_source=None, fatal=True):
420 """Return the xml as an xml.etree.ElementTree.Element"""
421 xml_string = self._download_webpage(
422 url_or_request, video_id, note, errnote, fatal=fatal)
423 if xml_string is False:
426 xml_string = transform_source(xml_string)
427 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
429 def _download_json(self, url_or_request, video_id,
430 note='Downloading JSON metadata',
431 errnote='Unable to download JSON metadata',
432 transform_source=None,
434 json_string = self._download_webpage(
435 url_or_request, video_id, note, errnote, fatal=fatal)
436 if (not fatal) and json_string is False:
438 return self._parse_json(
439 json_string, video_id, transform_source=transform_source, fatal=fatal)
441 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
443 json_string = transform_source(json_string)
445 return json.loads(json_string)
446 except ValueError as ve:
447 errmsg = '%s: Failed to parse JSON ' % video_id
449 raise ExtractorError(errmsg, cause=ve)
451 self.report_warning(errmsg + str(ve))
453 def report_warning(self, msg, video_id=None):
454 idstr = '' if video_id is None else '%s: ' % video_id
455 self._downloader.report_warning(
456 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
458 def to_screen(self, msg):
459 """Print msg to screen, prefixing it with '[ie_name]'"""
460 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
462 def report_extraction(self, id_or_name):
463 """Report information extraction."""
464 self.to_screen('%s: Extracting information' % id_or_name)
466 def report_download_webpage(self, video_id):
467 """Report webpage download."""
468 self.to_screen('%s: Downloading webpage' % video_id)
470 def report_age_confirmation(self):
471 """Report attempt to confirm age."""
472 self.to_screen('Confirming age')
474 def report_login(self):
475 """Report attempt to log in."""
476 self.to_screen('Logging in')
478 # Methods for following #608
480 def url_result(url, ie=None, video_id=None):
481 """Returns a url that points to a page that should be processed"""
482 # TODO: ie should be the class used for getting the info
483 video_info = {'_type': 'url',
486 if video_id is not None:
487 video_info['id'] = video_id
491 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
492 """Returns a playlist"""
493 video_info = {'_type': 'playlist',
496 video_info['id'] = playlist_id
498 video_info['title'] = playlist_title
499 if playlist_description:
500 video_info['description'] = playlist_description
503 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
505 Perform a regex search on the given string, using a single or a list of
506 patterns returning the first matching group.
507 In case of failure return a default value or raise a WARNING or a
508 RegexNotFoundError, depending on fatal, specifying the field name.
510 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
511 mobj = re.search(pattern, string, flags)
514 mobj = re.search(p, string, flags)
518 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
519 _name = '\033[0;34m%s\033[0m' % name
525 # return the first matching group
526 return next(g for g in mobj.groups() if g is not None)
528 return mobj.group(group)
529 elif default is not _NO_DEFAULT:
532 raise RegexNotFoundError('Unable to extract %s' % _name)
534 self._downloader.report_warning('unable to extract %s; '
535 'please report this issue on http://yt-dl.org/bug' % _name)
538 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
540 Like _search_regex, but strips HTML tags and unescapes entities.
542 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
544 return clean_html(res).strip()
548 def _get_login_info(self):
550 Get the login info as (username, password)
551 It will look in the netrc file using the _NETRC_MACHINE value
552 If there's no info available, return (None, None)
554 if self._downloader is None:
559 downloader_params = self._downloader.params
561 # Attempt to use provided username and password or .netrc data
562 if downloader_params.get('username', None) is not None:
563 username = downloader_params['username']
564 password = downloader_params['password']
565 elif downloader_params.get('usenetrc', False):
567 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
572 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
573 except (IOError, netrc.NetrcParseError) as err:
574 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
576 return (username, password)
578 def _get_tfa_info(self):
580 Get the two-factor authentication info
581 TODO - asking the user will be required for sms/phone verify
582 currently just uses the command line option
583 If there's no info available, return None
585 if self._downloader is None:
587 downloader_params = self._downloader.params
589 if downloader_params.get('twofactor', None) is not None:
590 return downloader_params['twofactor']
594 # Helper functions for extracting OpenGraph info
596 def _og_regexes(prop):
597 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
598 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
599 template = r'<meta[^>]+?%s[^>]+?%s'
601 template % (property_re, content_re),
602 template % (content_re, property_re),
605 def _og_search_property(self, prop, html, name=None, **kargs):
607 name = 'OpenGraph %s' % prop
608 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
611 return unescapeHTML(escaped)
613 def _og_search_thumbnail(self, html, **kargs):
614 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
616 def _og_search_description(self, html, **kargs):
617 return self._og_search_property('description', html, fatal=False, **kargs)
619 def _og_search_title(self, html, **kargs):
620 return self._og_search_property('title', html, **kargs)
622 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
623 regexes = self._og_regexes('video') + self._og_regexes('video:url')
625 regexes = self._og_regexes('video:secure_url') + regexes
626 return self._html_search_regex(regexes, html, name, **kargs)
628 def _og_search_url(self, html, **kargs):
629 return self._og_search_property('url', html, **kargs)
631 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
632 if display_name is None:
634 return self._html_search_regex(
636 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
637 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
638 html, display_name, fatal=fatal, group='content', **kwargs)
640 def _dc_search_uploader(self, html):
641 return self._html_search_meta('dc.creator', html, 'uploader')
643 def _rta_search(self, html):
644 # See http://www.rtalabel.org/index.php?content=howtofaq#single
645 if re.search(r'(?ix)<meta\s+name="rating"\s+'
646 r' content="RTA-5042-1996-1400-1577-RTA"',
651 def _media_rating_search(self, html):
652 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
653 rating = self._html_search_meta('rating', html)
665 return RATING_TABLE.get(rating.lower(), None)
667 def _family_friendly_search(self, html):
668 # See http://schema.org/VideoObject
669 family_friendly = self._html_search_meta('isFamilyFriendly', html)
671 if not family_friendly:
680 return RATING_TABLE.get(family_friendly.lower(), None)
682 def _twitter_search_player(self, html):
683 return self._html_search_meta('twitter:player', html,
684 'twitter card player')
686 def _sort_formats(self, formats):
688 raise ExtractorError('No video formats found')
691 # TODO remove the following workaround
692 from ..utils import determine_ext
693 if not f.get('ext') and 'url' in f:
694 f['ext'] = determine_ext(f['url'])
696 preference = f.get('preference')
697 if preference is None:
698 proto = f.get('protocol')
700 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
702 preference = 0 if proto in ['http', 'https'] else -0.1
703 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
706 if f.get('vcodec') == 'none': # audio only
707 if self._downloader.params.get('prefer_free_formats'):
708 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
710 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
713 audio_ext_preference = ORDER.index(f['ext'])
715 audio_ext_preference = -1
717 if self._downloader.params.get('prefer_free_formats'):
718 ORDER = ['flv', 'mp4', 'webm']
720 ORDER = ['webm', 'flv', 'mp4']
722 ext_preference = ORDER.index(f['ext'])
725 audio_ext_preference = 0
729 f.get('language_preference') if f.get('language_preference') is not None else -1,
730 f.get('quality') if f.get('quality') is not None else -1,
731 f.get('tbr') if f.get('tbr') is not None else -1,
732 f.get('filesize') if f.get('filesize') is not None else -1,
733 f.get('vbr') if f.get('vbr') is not None else -1,
734 f.get('height') if f.get('height') is not None else -1,
735 f.get('width') if f.get('width') is not None else -1,
737 f.get('abr') if f.get('abr') is not None else -1,
738 audio_ext_preference,
739 f.get('fps') if f.get('fps') is not None else -1,
740 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
741 f.get('source_preference') if f.get('source_preference') is not None else -1,
744 formats.sort(key=_formats_key)
746 def _check_formats(self, formats, video_id):
749 lambda f: self._is_valid_url(
751 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
754 def _is_valid_url(self, url, video_id, item='video'):
756 self._request_webpage(
757 HEADRequest(url), video_id,
758 'Checking %s URL' % item)
760 except ExtractorError as e:
761 if isinstance(e.cause, compat_HTTPError):
763 '%s URL is invalid, skipping' % item, video_id)
767 def http_scheme(self):
768 """ Either "http:" or "https:", depending on the user's preferences """
771 if self._downloader.params.get('prefer_insecure', False)
774 def _proto_relative_url(self, url, scheme=None):
777 if url.startswith('//'):
779 scheme = self.http_scheme()
784 def _sleep(self, timeout, video_id, msg_template=None):
785 if msg_template is None:
786 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
787 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
791 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
792 manifest = self._download_xml(
793 manifest_url, video_id, 'Downloading f4m manifest',
794 'Unable to download f4m manifest')
797 manifest_version = '1.0'
798 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
800 manifest_version = '2.0'
801 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
802 for i, media_el in enumerate(media_nodes):
803 if manifest_version == '2.0':
804 manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/'
805 + (media_el.attrib.get('href') or media_el.attrib.get('url')))
806 tbr = int_or_none(media_el.attrib.get('bitrate'))
808 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
812 'width': int_or_none(media_el.attrib.get('width')),
813 'height': int_or_none(media_el.attrib.get('height')),
814 'preference': preference,
816 self._sort_formats(formats)
820 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
821 entry_protocol='m3u8', preference=None,
825 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
830 'resolution': 'multiple',
831 'format_note': 'Quality selection URL',
834 format_url = lambda u: (
836 if re.match(r'^https?://', u)
837 else compat_urlparse.urljoin(m3u8_url, u))
839 m3u8_doc = self._download_webpage(
841 note='Downloading m3u8 information',
842 errnote='Failed to download m3u8 information')
845 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
846 for line in m3u8_doc.splitlines():
847 if line.startswith('#EXT-X-STREAM-INF:'):
849 for m in kv_rex.finditer(line):
851 if v.startswith('"'):
853 last_info[m.group('key')] = v
854 elif line.startswith('#') or not line.strip():
857 if last_info is None:
858 formats.append({'url': format_url(line)})
860 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
862 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
863 'url': format_url(line.strip()),
866 'protocol': entry_protocol,
867 'preference': preference,
869 codecs = last_info.get('CODECS')
871 # TODO: looks like video codec is not always necessarily goes first
872 va_codecs = codecs.split(',')
874 f['vcodec'] = va_codecs[0].partition('.')[0]
875 if len(va_codecs) > 1 and va_codecs[1]:
876 f['acodec'] = va_codecs[1].partition('.')[0]
877 resolution = last_info.get('RESOLUTION')
879 width_str, height_str = resolution.split('x')
880 f['width'] = int(width_str)
881 f['height'] = int(height_str)
884 self._sort_formats(formats)
887 # TODO: improve extraction
888 def _extract_smil_formats(self, smil_url, video_id, fatal=True):
889 smil = self._download_xml(
890 smil_url, video_id, 'Downloading SMIL file',
891 'Unable to download SMIL file', fatal=fatal)
896 base = smil.find('./head/meta').get('base')
900 for video in smil.findall('./body/switch/video'):
901 src = video.get('src')
904 bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
905 width = int_or_none(video.get('width'))
906 height = int_or_none(video.get('height'))
907 proto = video.get('proto')
910 if base.startswith('rtmp'):
912 elif base.startswith('http'):
914 ext = video.get('ext')
916 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
917 elif proto == 'rtmp':
919 streamer = video.get('streamer') or base
924 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
929 self._sort_formats(formats)
933 def _live_title(self, name):
934 """ Generate the title for a live video """
935 now = datetime.datetime.now()
936 now_str = now.strftime("%Y-%m-%d %H:%M")
937 return name + ' ' + now_str
939 def _int(self, v, name, fatal=False, **kwargs):
940 res = int_or_none(v, **kwargs)
941 if 'get_attr' in kwargs:
942 print(getattr(v, kwargs['get_attr']))
944 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
946 raise ExtractorError(msg)
948 self._downloader.report_warning(msg)
951 def _float(self, v, name, fatal=False, **kwargs):
952 res = float_or_none(v, **kwargs)
954 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
956 raise ExtractorError(msg)
958 self._downloader.report_warning(msg)
961 def _set_cookie(self, domain, name, value, expire_time=None):
962 cookie = compat_cookiejar.Cookie(
963 0, name, value, None, None, domain, None,
964 None, '/', True, False, expire_time, '', None, None, None)
965 self._downloader.cookiejar.set_cookie(cookie)
967 def get_testcases(self, include_onlymatching=False):
968 t = getattr(self, '_TEST', None)
970 assert not hasattr(self, '_TESTS'), \
971 '%s has _TEST and _TESTS' % type(self).__name__
974 tests = getattr(self, '_TESTS', [])
976 if not include_onlymatching and t.get('only_matching', False):
978 t['name'] = type(self).__name__[:-len('IE')]
981 def is_suitable(self, age_limit):
982 """ Test whether the extractor is generally suitable for the given
983 age limit (i.e. pornographic sites are not, all others usually are) """
985 any_restricted = False
986 for tc in self.get_testcases(include_onlymatching=False):
988 tc = tc['playlist'][0]
989 is_restricted = age_restricted(
990 tc.get('info_dict', {}).get('age_limit'), age_limit)
991 if not is_restricted:
993 any_restricted = any_restricted or is_restricted
994 return not any_restricted
997 class SearchInfoExtractor(InfoExtractor):
999 Base class for paged search queries extractors.
1000 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
1001 Instances should define _SEARCH_KEY and _MAX_RESULTS.
1005 def _make_valid_url(cls):
1006 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1009 def suitable(cls, url):
1010 return re.match(cls._make_valid_url(), url) is not None
1012 def _real_extract(self, query):
1013 mobj = re.match(self._make_valid_url(), query)
1015 raise ExtractorError('Invalid search query "%s"' % query)
1017 prefix = mobj.group('prefix')
1018 query = mobj.group('query')
1020 return self._get_n_results(query, 1)
1021 elif prefix == 'all':
1022 return self._get_n_results(query, self._MAX_RESULTS)
1026 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1027 elif n > self._MAX_RESULTS:
1028 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1029 n = self._MAX_RESULTS
1030 return self._get_n_results(query, n)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Abstract hook: concrete search extractors must override this.
        raise NotImplementedError("This method must be implemented by subclasses")
1037 def SEARCH_KEY(self):
1038 return self._SEARCH_KEY