_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_http_client,
  18     compat_urllib_error,
  19     compat_urllib_parse_urlparse,
  20     compat_urlparse,
  21     compat_str,
  22 )
  23 from ..utils import (
  24     age_restricted,
  25     clean_html,
  26     compiled_regex_type,
  27     ExtractorError,
  28     float_or_none,
  29     int_or_none,
  30     RegexNotFoundError,
  31     sanitize_filename,
  32     unescapeHTML,
  33 )
# Unique sentinel used as the default of the ``default`` parameter of
# _search_regex()/_html_search_regex(), so that an explicitly passed
# ``default=None`` can be told apart from "no default given".
_NO_DEFAULT = object()
  35
  36
  37 class InfoExtractor(object):
  38     """Information Extractor class.
  39
  40     Information extractors are the classes that, given a URL, extract
  41     information about the video (or videos) the URL refers to. This
  42     information includes the real video URL, the video title, author and
  43     others. The information is stored in a dictionary which is then
  44     passed to the YoutubeDL. The YoutubeDL processes this
  45     information possibly downloading the video to the file system, among
  46     other possible outcomes.
  47
   48     The type field determines the type of the result.
  49     By far the most common value (and the default if _type is missing) is
  50     "video", which indicates a single video.
  51
  52     For a video, the dictionaries must include the following fields:
  53
  54     id:             Video identifier.
  55     title:          Video title, unescaped.
  56
  57     Additionally, it must contain either a formats entry or a url one:
  58
  59     formats:        A list of dictionaries for each format available, ordered
  60                     from worst to best quality.
  61
  62                     Potential fields:
  63                     * url        Mandatory. The URL of the video file
  64                     * ext        Will be calculated from url if missing
  65                     * format     A human-readable description of the format
  66                                  ("mp4 container with h264/opus").
   67                                  Calculated from the format_id, width, height,
  68                                  and format_note fields if missing.
  69                     * format_id  A short description of the format
  70                                  ("mp4_h264_opus" or "19").
  71                                 Technically optional, but strongly recommended.
  72                     * format_note Additional info about the format
  73                                  ("3D" or "DASH video")
  74                     * width      Width of the video, if known
  75                     * height     Height of the video, if known
  76                     * resolution Textual description of width and height
  77                     * tbr        Average bitrate of audio and video in KBit/s
  78                     * abr        Average audio bitrate in KBit/s
  79                     * acodec     Name of the audio codec in use
  80                     * asr        Audio sampling rate in Hertz
  81                     * vbr        Average video bitrate in KBit/s
  82                     * fps        Frame rate
  83                     * vcodec     Name of the video codec in use
  84                     * container  Name of the container format
  85                     * filesize   The number of bytes, if known in advance
  86                     * filesize_approx  An estimate for the number of bytes
  87                     * player_url SWF Player URL (used for rtmpdump).
  88                     * protocol   The protocol that will be used for the actual
  89                                  download, lower-case.
  90                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  91                     * preference Order number of this format. If this field is
  92                                  present and not None, the formats get sorted
  93                                  by this field, regardless of all other values.
  94                                  -1 for default (order by other properties),
  95                                  -2 or smaller for less than default.
  96                                  < -1000 to hide the format (if there is
  97                                     another one which is strictly better)
  98                     * language_preference  Is this in the correct requested
  99                                  language?
 100                                  10 if it's what the URL is about,
 101                                  -1 for default (don't know),
 102                                  -10 otherwise, other values reserved for now.
 103                     * quality    Order number of the video quality of this
 104                                  format, irrespective of the file format.
 105                                  -1 for default (order by other properties),
 106                                  -2 or smaller for less than default.
 107                     * source_preference  Order number for this video source
 108                                   (quality takes higher priority)
 109                                  -1 for default (order by other properties),
 110                                  -2 or smaller for less than default.
 111                     * http_referer  HTTP Referer header value to set.
 112                     * http_method  HTTP method to use for the download.
 113                     * http_headers  A dictionary of additional HTTP headers
 114                                  to add to the request.
 115                     * http_post_data  Additional data to send with a POST
 116                                  request.
 117     url:            Final video URL.
 118     ext:            Video filename extension.
 119     format:         The video format, defaults to ext (used for --get-format)
 120     player_url:     SWF Player URL (used for rtmpdump).
 121
 122     The following fields are optional:
 123
 124     alt_title:      A secondary title of the video.
  125     display_id:     An alternative identifier for the video, not necessarily
 126                     unique, but available before title. Typically, id is
 127                     something like "4234987", title "Dancing naked mole rats",
 128                     and display_id "dancing-naked-mole-rats"
 129     thumbnails:     A list of dictionaries, with the following entries:
 130                         * "url"
 131                         * "width" (optional, int)
 132                         * "height" (optional, int)
  133                         * "resolution" (optional, string "{width}x{height}",
 134                                         deprecated)
 135     thumbnail:      Full URL to a video thumbnail image.
 136     description:    Full video description.
 137     uploader:       Full name of the video uploader.
 138     timestamp:      UNIX timestamp of the moment the video became available.
 139     upload_date:    Video upload date (YYYYMMDD).
 140                     If not explicitly set, calculated from timestamp.
 141     uploader_id:    Nickname or id of the video uploader.
 142     location:       Physical location where the video was filmed.
 143     subtitles:      The subtitle file contents as a dictionary in the format
 144                     {language: subtitles}.
 145     duration:       Length of the video in seconds, as an integer.
 146     view_count:     How many users have watched the video on the platform.
 147     like_count:     Number of positive ratings of the video
 148     dislike_count:  Number of negative ratings of the video
 149     comment_count:  Number of comments on the video
 150     comments:       A list of comments, each with one or more of the following
 151                     properties (all but one of text or html optional):
 152                         * "author" - human-readable name of the comment author
 153                         * "author_id" - user ID of the comment author
 154                         * "id" - Comment ID
 155                         * "html" - Comment as HTML
 156                         * "text" - Plain text of the comment
 157                         * "timestamp" - UNIX timestamp of comment
 158                         * "parent" - ID of the comment this one is replying to.
 159                                      Set to "root" to indicate that this is a
 160                                      comment to the original video.
 161     age_limit:      Age restriction for the video, as an integer (years)
 162     webpage_url:    The url to the video webpage, if given to youtube-dl it
 163                     should allow to get the same result again. (It will be set
 164                     by YoutubeDL if it's missing)
 165     categories:     A list of categories that the video falls in, for example
 166                     ["Sports", "Berlin"]
 167     is_live:        True, False, or None (=unknown). Whether this video is a
 168                     live stream that goes on instead of a fixed-length video.
 169
 170     Unless mentioned otherwise, the fields should be Unicode strings.
 171
 172     Unless mentioned otherwise, None is equivalent to absence of information.
 173
 174
 175     _type "playlist" indicates multiple videos.
 176     There must be a key "entries", which is a list, an iterable, or a PagedList
 177     object, each element of which is a valid dictionary by this specification.
 178
 179     Additionally, playlists can have "title" and "id" attributes with the same
 180     semantics as videos (see above).
 181
 182
 183     _type "multi_video" indicates that there are multiple videos that
  184     form a single show, for example multiple acts of an opera or TV episode.
 185     It must have an entries key like a playlist and contain all the keys
 186     required for a video at the same time.
 187
 188
 189     _type "url" indicates that the video must be extracted from another
 190     location, possibly by a different extractor. Its only required key is:
 191     "url" - the next URL to extract.
 192     The key "ie_key" can be set to the class name (minus the trailing "IE",
 193     e.g. "Youtube") if the extractor class is known in advance.
 194     Additionally, the dictionary may have any properties of the resolved entity
 195     known in advance, for example "title" if the title of the referred video is
 196     known ahead of time.
 197
 198
 199     _type "url_transparent" entities have the same specification as "url", but
 200     indicate that the given additional information is more precise than the one
 201     associated with the resolved URL.
 202     This is useful when a site employs a video service that hosts the video and
 203     its technical metadata, but that video service does not embed a useful
 204     title, description etc.
 205
 206
 207     Subclasses of this one should re-define the _real_initialize() and
 208     _real_extract() methods and define a _VALID_URL regexp.
 209     Probably, they should also be added to the list of extractors.
 210
 211     Finally, the _WORKING attribute should be set to False for broken IEs
 212     in order to warn the users and skip the tests.
 213     """
 214
    # Set to True by initialize() once _real_initialize() has been run.
    _ready = False
    # The downloader this extractor reports to; assigned by set_downloader().
    _downloader = None
    # Value returned by working(); set to False in broken extractors.
    _WORKING = True
 218
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        The instance starts out not ready; initialize() performs the real
        initialization on first use.
        """
        self._ready = False
        self.set_downloader(downloader)
 223
 224     @classmethod
 225     def suitable(cls, url):
 226         """Receives a URL and returns True if suitable for this IE."""
 227
 228         # This does not use has/getattr intentionally - we want to know whether
 229         # we have cached the regexp for *this* class, whereas getattr would also
 230         # match the superclass
 231         if '_VALID_URL_RE' not in cls.__dict__:
 232             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 233         return cls._VALID_URL_RE.match(url) is not None
 234
 235     @classmethod
 236     def _match_id(cls, url):
 237         if '_VALID_URL_RE' not in cls.__dict__:
 238             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 239         m = cls._VALID_URL_RE.match(url)
 240         assert m
 241         return m.group('id')
 242
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class attribute unchanged; it is True unless the class
        overrides it.
        """
        return cls._WORKING
 247
 248     def initialize(self):
 249         """Initializes an instance (authentication, etc)."""
 250         if not self._ready:
 251             self._real_initialize()
 252             self._ready = True
 253
    def extract(self, url):
        """Extracts URL information and returns it.

        Makes sure the extractor is initialized, then returns whatever
        _real_extract(url) returns.
        """
        self.initialize()
        return self._real_extract(url)
 258
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The object is stored as self._downloader; other methods of this class
        use it for network access, screen output and option lookup.
        """
        self._downloader = downloader
 262
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        Called by initialize() the first time it runs. The base
        implementation does nothing.
        """
        pass
 266
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Called by extract() with the URL to process. The base implementation
        does nothing and returns None.
        """
        pass
 270
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # The class name without its last two characters (the "IE" suffix
        # extractor classes are named with).
        return cls.__name__[:-2]
 275
    @property
    def IE_NAME(self):
        """Name used to prefix screen output: the class name minus its last
        two characters."""
        return type(self).__name__[:-2]
 279
 280     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 281         """ Returns the response handle """
 282         if note is None:
 283             self.report_download_webpage(video_id)
 284         elif note is not False:
 285             if video_id is None:
 286                 self.to_screen('%s' % (note,))
 287             else:
 288                 self.to_screen('%s: %s' % (video_id, note))
 289         try:
 290             return self._downloader.urlopen(url_or_request)
 291         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 292             if errnote is False:
 293                 return False
 294             if errnote is None:
 295                 errnote = 'Unable to download webpage'
 296             errmsg = '%s: %s' % (errnote, compat_str(err))
 297             if fatal:
 298                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 299             else:
 300                 self._downloader.report_warning(errmsg)
 301                 return False
 302
 303     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 304         """ Returns a tuple (page content as string, URL handle) """
 305         # Strip hashes from the URL (#1038)
 306         if isinstance(url_or_request, (compat_str, str)):
 307             url_or_request = url_or_request.partition('#')[0]
 308
 309         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 310         if urlh is False:
 311             assert not fatal
 312             return False
 313         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 314         return (content, urlh)
 315
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
        """Read the body of urlh and return it decoded to text.

        prefix, if not None, is prepended to the bytes read from urlh.

        The encoding is taken from the charset parameter of the Content-Type
        header; failing that from a <meta ... charset=...> declaration within
        the first 1024 bytes; failing that 'utf-16' is assumed when the data
        starts with the bytes FF FE, and 'utf-8' otherwise. Undecodable bytes
        are replaced, and an unknown encoding name falls back to UTF-8.

        Depending on the downloader parameters the raw bytes are additionally
        printed base64-encoded ('dump_intermediate_pages') or written to a
        '.dump' file in the current directory ('write_pages').

        Raises ExtractorError (expected=True) when the page looks like a
        Websense block page. note, errnote and fatal are accepted but not
        used by this method.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        # 1st choice: charset announced in the HTTP Content-Type header
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # 2nd choice: charset declared in a <meta> tag near the page start
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                # Leading FF FE is the little-endian UTF-16 byte order mark
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request is either a request object or a plain URL string
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            # Cap the name at 240 characters; the cut-off tail is replaced by
            # an MD5 digest of the full name so distinct URLs stay distinct.
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The announced encoding is not known to Python
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return content
 378
 379     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 380         """ Returns the data of the page as a string """
 381         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 382         if res is False:
 383             return res
 384         else:
 385             content, _ = res
 386             return content
 387
 388     def _download_xml(self, url_or_request, video_id,
 389                       note='Downloading XML', errnote='Unable to download XML',
 390                       transform_source=None, fatal=True):
 391         """Return the xml as an xml.etree.ElementTree.Element"""
 392         xml_string = self._download_webpage(
 393             url_or_request, video_id, note, errnote, fatal=fatal)
 394         if xml_string is False:
 395             return xml_string
 396         if transform_source:
 397             xml_string = transform_source(xml_string)
 398         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 399
 400     def _download_json(self, url_or_request, video_id,
 401                        note='Downloading JSON metadata',
 402                        errnote='Unable to download JSON metadata',
 403                        transform_source=None,
 404                        fatal=True):
 405         json_string = self._download_webpage(
 406             url_or_request, video_id, note, errnote, fatal=fatal)
 407         if (not fatal) and json_string is False:
 408             return None
 409         return self._parse_json(
 410             json_string, video_id, transform_source=transform_source, fatal=fatal)
 411
 412     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 413         if transform_source:
 414             json_string = transform_source(json_string)
 415         try:
 416             return json.loads(json_string)
 417         except ValueError as ve:
 418             errmsg = '%s: Failed to parse JSON ' % video_id
 419             if fatal:
 420                 raise ExtractorError(errmsg, cause=ve)
 421             else:
 422                 self.report_warning(errmsg + str(ve))
 423
 424     def report_warning(self, msg, video_id=None):
 425         idstr = '' if video_id is None else '%s: ' % video_id
 426         self._downloader.report_warning(
 427             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 428
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        # Output goes through the downloader's to_screen().
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 432
    def report_extraction(self, id_or_name):
        """Report information extraction.

        Prints '<id_or_name>: Extracting information' via to_screen().
        """
        self.to_screen('%s: Extracting information' % id_or_name)
 436
    def report_download_webpage(self, video_id):
        """Report webpage download.

        Prints '<video_id>: Downloading webpage' via to_screen().
        """
        self.to_screen('%s: Downloading webpage' % video_id)
 440
    def report_age_confirmation(self):
        """Report attempt to confirm age.

        Prints 'Confirming age' via to_screen().
        """
        self.to_screen('Confirming age')
 444
    def report_login(self):
        """Report attempt to log in.

        Prints 'Logging in' via to_screen().
        """
        self.to_screen('Logging in')
 448
 449     # Methods for following #608
 450     @staticmethod
 451     def url_result(url, ie=None, video_id=None):
 452         """Returns a url that points to a page that should be processed"""
 453         # TODO: ie should be the class used for getting the info
 454         video_info = {'_type': 'url',
 455                       'url': url,
 456                       'ie_key': ie}
 457         if video_id is not None:
 458             video_info['id'] = video_id
 459         return video_info
 460
 461     @staticmethod
 462     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 463         """Returns a playlist"""
 464         video_info = {'_type': 'playlist',
 465                       'entries': entries}
 466         if playlist_id:
 467             video_info['id'] = playlist_id
 468         if playlist_title:
 469             video_info['title'] = playlist_title
 470         if playlist_description:
 471             video_info['description'] = playlist_description
 472         return video_info
 473
 474     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 475         """
 476         Perform a regex search on the given string, using a single or a list of
 477         patterns returning the first matching group.
 478         In case of failure return a default value or raise a WARNING or a
 479         RegexNotFoundError, depending on fatal, specifying the field name.
 480         """
 481         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 482             mobj = re.search(pattern, string, flags)
 483         else:
 484             for p in pattern:
 485                 mobj = re.search(p, string, flags)
 486                 if mobj:
 487                     break
 488
 489         if os.name != 'nt' and sys.stderr.isatty():
 490             _name = '\033[0;34m%s\033[0m' % name
 491         else:
 492             _name = name
 493
 494         if mobj:
 495             if group is None:
 496                 # return the first matching group
 497                 return next(g for g in mobj.groups() if g is not None)
 498             else:
 499                 return mobj.group(group)
 500         elif default is not _NO_DEFAULT:
 501             return default
 502         elif fatal:
 503             raise RegexNotFoundError('Unable to extract %s' % _name)
 504         else:
 505             self._downloader.report_warning('unable to extract %s; '
 506                                             'please report this issue on http://yt-dl.org/bug' % _name)
 507             return None
 508
 509     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 510         """
 511         Like _search_regex, but strips HTML tags and unescapes entities.
 512         """
 513         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 514         if res:
 515             return clean_html(res).strip()
 516         else:
 517             return res
 518
 519     def _get_login_info(self):
 520         """
 521         Get the the login info as (username, password)
 522         It will look in the netrc file using the _NETRC_MACHINE value
 523         If there's no info available, return (None, None)
 524         """
 525         if self._downloader is None:
 526             return (None, None)
 527
 528         username = None
 529         password = None
 530         downloader_params = self._downloader.params
 531
 532         # Attempt to use provided username and password or .netrc data
 533         if downloader_params.get('username', None) is not None:
 534             username = downloader_params['username']
 535             password = downloader_params['password']
 536         elif downloader_params.get('usenetrc', False):
 537             try:
 538                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 539                 if info is not None:
 540                     username = info[0]
 541                     password = info[2]
 542                 else:
 543                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 544             except (IOError, netrc.NetrcParseError) as err:
 545                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 546
 547         return (username, password)
 548
 549     def _get_tfa_info(self):
 550         """
 551         Get the two-factor authentication info
 552         TODO - asking the user will be required for sms/phone verify
 553         currently just uses the command line option
 554         If there's no info available, return None
 555         """
 556         if self._downloader is None:
 557             return None
 558         downloader_params = self._downloader.params
 559
 560         if downloader_params.get('twofactor', None) is not None:
 561             return downloader_params['twofactor']
 562
 563         return None
 564
 565     # Helper functions for extracting OpenGraph info
 566     @staticmethod
 567     def _og_regexes(prop):
 568         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 569         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 570         template = r'<meta[^>]+?%s[^>]+?%s'
 571         return [
 572             template % (property_re, content_re),
 573             template % (content_re, property_re),
 574         ]
 575
 576     def _og_search_property(self, prop, html, name=None, **kargs):
 577         if name is None:
 578             name = 'OpenGraph %s' % prop
 579         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 580         if escaped is None:
 581             return None
 582         return unescapeHTML(escaped)
 583
    def _og_search_thumbnail(self, html, **kargs):
        """Non-fatal lookup of the OpenGraph "image" property in html."""
        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 586
    def _og_search_description(self, html, **kargs):
        """Non-fatal lookup of the OpenGraph "description" property in html."""
        return self._og_search_property('description', html, fatal=False, **kargs)
 589
    def _og_search_title(self, html, **kargs):
        """Look up the OpenGraph "title" property in html.

        Keyword arguments are passed on to _og_search_property().
        """
        return self._og_search_property('title', html, **kargs)
 592
 593     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 594         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 595         if secure:
 596             regexes = self._og_regexes('video:secure_url') + regexes
 597         return self._html_search_regex(regexes, html, name, **kargs)
 598
    def _og_search_url(self, html, **kargs):
        """Look up the OpenGraph "url" property in html.

        Keyword arguments are passed on to _og_search_property().
        """
        return self._og_search_property('url', html, **kargs)
 601
 602     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 603         if display_name is None:
 604             display_name = name
 605         return self._html_search_regex(
 606             r'''(?isx)<meta
 607                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 608                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
 609             html, display_name, fatal=fatal, group='content', **kwargs)
 610
    def _dc_search_uploader(self, html):
        """Look up the uploader in the "dc.creator" <meta> tag of html."""
        return self._html_search_meta('dc.creator', html, 'uploader')
 613
 614     def _rta_search(self, html):
 615         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 616         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 617                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 618                      html):
 619             return 18
 620         return 0
 621
 622     def _media_rating_search(self, html):
 623         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 624         rating = self._html_search_meta('rating', html)
 625
 626         if not rating:
 627             return None
 628
 629         RATING_TABLE = {
 630             'safe for kids': 0,
 631             'general': 8,
 632             '14 years': 14,
 633             'mature': 17,
 634             'restricted': 19,
 635         }
 636         return RATING_TABLE.get(rating.lower(), None)
 637
    def _twitter_search_player(self, html):
        """Look up the "twitter:player" <meta> tag of html."""
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
 641
 642     def _sort_formats(self, formats):
 643         if not formats:
 644             raise ExtractorError('No video formats found')
 645
 646         def _formats_key(f):
 647             # TODO remove the following workaround
 648             from ..utils import determine_ext
 649             if not f.get('ext') and 'url' in f:
 650                 f['ext'] = determine_ext(f['url'])
 651
 652             preference = f.get('preference')
 653             if preference is None:
 654                 proto = f.get('protocol')
 655                 if proto is None:
 656                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 657
 658                 preference = 0 if proto in ['http', 'https'] else -0.1
 659                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 660                     preference -= 0.5
 661
 662             if f.get('vcodec') == 'none':  # audio only
 663                 if self._downloader.params.get('prefer_free_formats'):
 664                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 665                 else:
 666                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 667                 ext_preference = 0
 668                 try:
 669                     audio_ext_preference = ORDER.index(f['ext'])
 670                 except ValueError:
 671                     audio_ext_preference = -1
 672             else:
 673                 if self._downloader.params.get('prefer_free_formats'):
 674                     ORDER = ['flv', 'mp4', 'webm']
 675                 else:
 676                     ORDER = ['webm', 'flv', 'mp4']
 677                 try:
 678                     ext_preference = ORDER.index(f['ext'])
 679                 except ValueError:
 680                     ext_preference = -1
 681                 audio_ext_preference = 0
 682
 683             return (
 684                 preference,
 685                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 686                 f.get('quality') if f.get('quality') is not None else -1,
 687                 f.get('height') if f.get('height') is not None else -1,
 688                 f.get('width') if f.get('width') is not None else -1,
 689                 ext_preference,
 690                 f.get('tbr') if f.get('tbr') is not None else -1,
 691                 f.get('vbr') if f.get('vbr') is not None else -1,
 692                 f.get('abr') if f.get('abr') is not None else -1,
 693                 audio_ext_preference,
 694                 f.get('fps') if f.get('fps') is not None else -1,
 695                 f.get('filesize') if f.get('filesize') is not None else -1,
 696                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 697                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 698                 f.get('format_id'),
 699             )
 700         formats.sort(key=_formats_key)
 701
 702     def http_scheme(self):
 703         """ Either "http:" or "https:", depending on the user's preferences """
 704         return (
 705             'http:'
 706             if self._downloader.params.get('prefer_insecure', False)
 707             else 'https:')
 708
 709     def _proto_relative_url(self, url, scheme=None):
 710         if url is None:
 711             return url
 712         if url.startswith('//'):
 713             if scheme is None:
 714                 scheme = self.http_scheme()
 715             return scheme + url
 716         else:
 717             return url
 718
 719     def _sleep(self, timeout, video_id, msg_template=None):
 720         if msg_template is None:
 721             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 722         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 723         self.to_screen(msg)
 724         time.sleep(timeout)
 725
 726     def _extract_f4m_formats(self, manifest_url, video_id):
 727         manifest = self._download_xml(
 728             manifest_url, video_id, 'Downloading f4m manifest',
 729             'Unable to download f4m manifest')
 730
 731         formats = []
 732         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 733         for i, media_el in enumerate(media_nodes):
 734             tbr = int_or_none(media_el.attrib.get('bitrate'))
 735             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 736             formats.append({
 737                 'format_id': format_id,
 738                 'url': manifest_url,
 739                 'ext': 'flv',
 740                 'tbr': tbr,
 741                 'width': int_or_none(media_el.attrib.get('width')),
 742                 'height': int_or_none(media_el.attrib.get('height')),
 743             })
 744         self._sort_formats(formats)
 745
 746         return formats
 747
 748     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 749                               entry_protocol='m3u8', preference=None):
 750
 751         formats = [{
 752             'format_id': 'm3u8-meta',
 753             'url': m3u8_url,
 754             'ext': ext,
 755             'protocol': 'm3u8',
 756             'preference': -1,
 757             'resolution': 'multiple',
 758             'format_note': 'Quality selection URL',
 759         }]
 760
 761         format_url = lambda u: (
 762             u
 763             if re.match(r'^https?://', u)
 764             else compat_urlparse.urljoin(m3u8_url, u))
 765
 766         m3u8_doc = self._download_webpage(
 767             m3u8_url, video_id,
 768             note='Downloading m3u8 information',
 769             errnote='Failed to download m3u8 information')
 770         last_info = None
 771         kv_rex = re.compile(
 772             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 773         for line in m3u8_doc.splitlines():
 774             if line.startswith('#EXT-X-STREAM-INF:'):
 775                 last_info = {}
 776                 for m in kv_rex.finditer(line):
 777                     v = m.group('val')
 778                     if v.startswith('"'):
 779                         v = v[1:-1]
 780                     last_info[m.group('key')] = v
 781             elif line.startswith('#') or not line.strip():
 782                 continue
 783             else:
 784                 if last_info is None:
 785                     formats.append({'url': format_url(line)})
 786                     continue
 787                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 788
 789                 f = {
 790                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 791                     'url': format_url(line.strip()),
 792                     'tbr': tbr,
 793                     'ext': ext,
 794                     'protocol': entry_protocol,
 795                     'preference': preference,
 796                 }
 797                 codecs = last_info.get('CODECS')
 798                 if codecs:
 799                     # TODO: looks like video codec is not always necessarily goes first
 800                     va_codecs = codecs.split(',')
 801                     if va_codecs[0]:
 802                         f['vcodec'] = va_codecs[0].partition('.')[0]
 803                     if len(va_codecs) > 1 and va_codecs[1]:
 804                         f['acodec'] = va_codecs[1].partition('.')[0]
 805                 resolution = last_info.get('RESOLUTION')
 806                 if resolution:
 807                     width_str, height_str = resolution.split('x')
 808                     f['width'] = int(width_str)
 809                     f['height'] = int(height_str)
 810                 formats.append(f)
 811                 last_info = {}
 812         self._sort_formats(formats)
 813         return formats
 814
 815     # TODO: improve extraction
 816     def _extract_smil_formats(self, smil_url, video_id):
 817         smil = self._download_xml(
 818             smil_url, video_id, 'Downloading SMIL file',
 819             'Unable to download SMIL file')
 820
 821         base = smil.find('./head/meta').get('base')
 822
 823         formats = []
 824         rtmp_count = 0
 825         for video in smil.findall('./body/switch/video'):
 826             src = video.get('src')
 827             if not src:
 828                 continue
 829             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
 830             width = int_or_none(video.get('width'))
 831             height = int_or_none(video.get('height'))
 832             proto = video.get('proto')
 833             if not proto:
 834                 if base:
 835                     if base.startswith('rtmp'):
 836                         proto = 'rtmp'
 837                     elif base.startswith('http'):
 838                         proto = 'http'
 839             ext = video.get('ext')
 840             if proto == 'm3u8':
 841                 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
 842             elif proto == 'rtmp':
 843                 rtmp_count += 1
 844                 streamer = video.get('streamer') or base
 845                 formats.append({
 846                     'url': streamer,
 847                     'play_path': src,
 848                     'ext': 'flv',
 849                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
 850                     'tbr': bitrate,
 851                     'width': width,
 852                     'height': height,
 853                 })
 854         self._sort_formats(formats)
 855
 856         return formats
 857
 858     def _live_title(self, name):
 859         """ Generate the title for a live video """
 860         now = datetime.datetime.now()
 861         now_str = now.strftime("%Y-%m-%d %H:%M")
 862         return name + ' ' + now_str
 863
 864     def _int(self, v, name, fatal=False, **kwargs):
 865         res = int_or_none(v, **kwargs)
 866         if 'get_attr' in kwargs:
 867             print(getattr(v, kwargs['get_attr']))
 868         if res is None:
 869             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 870             if fatal:
 871                 raise ExtractorError(msg)
 872             else:
 873                 self._downloader.report_warning(msg)
 874         return res
 875
 876     def _float(self, v, name, fatal=False, **kwargs):
 877         res = float_or_none(v, **kwargs)
 878         if res is None:
 879             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 880             if fatal:
 881                 raise ExtractorError(msg)
 882             else:
 883                 self._downloader.report_warning(msg)
 884         return res
 885
 886     def _set_cookie(self, domain, name, value, expire_time=None):
 887         cookie = compat_cookiejar.Cookie(
 888             0, name, value, None, None, domain, None,
 889             None, '/', True, False, expire_time, '', None, None, None)
 890         self._downloader.cookiejar.set_cookie(cookie)
 891
 892     def get_testcases(self, include_onlymatching=False):
 893         t = getattr(self, '_TEST', None)
 894         if t:
 895             assert not hasattr(self, '_TESTS'), \
 896                 '%s has _TEST and _TESTS' % type(self).__name__
 897             tests = [t]
 898         else:
 899             tests = getattr(self, '_TESTS', [])
 900         for t in tests:
 901             if not include_onlymatching and t.get('only_matching', False):
 902                 continue
 903             t['name'] = type(self).__name__[:-len('IE')]
 904             yield t
 905
 906     def is_suitable(self, age_limit):
 907         """ Test whether the extractor is generally suitable for the given
 908         age limit (i.e. pornographic sites are not, all others usually are) """
 909
 910         any_restricted = False
 911         for tc in self.get_testcases(include_onlymatching=False):
 912             if 'playlist' in tc:
 913                 tc = tc['playlist'][0]
 914             is_restricted = age_restricted(
 915                 tc.get('info_dict', {}).get('age_limit'), age_limit)
 916             if not is_restricted:
 917                 return True
 918             any_restricted = any_restricted or is_restricted
 919         return not any_restricted
 920
 921
 922 class SearchInfoExtractor(InfoExtractor):
 923     """
 924     Base class for paged search queries extractors.
 925     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 926     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 927     """
 928
 929     @classmethod
 930     def _make_valid_url(cls):
 931         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 932
 933     @classmethod
 934     def suitable(cls, url):
 935         return re.match(cls._make_valid_url(), url) is not None
 936
 937     def _real_extract(self, query):
 938         mobj = re.match(self._make_valid_url(), query)
 939         if mobj is None:
 940             raise ExtractorError('Invalid search query "%s"' % query)
 941
 942         prefix = mobj.group('prefix')
 943         query = mobj.group('query')
 944         if prefix == '':
 945             return self._get_n_results(query, 1)
 946         elif prefix == 'all':
 947             return self._get_n_results(query, self._MAX_RESULTS)
 948         else:
 949             n = int(prefix)
 950             if n <= 0:
 951                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 952             elif n > self._MAX_RESULTS:
 953                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 954                 n = self._MAX_RESULTS
 955             return self._get_n_results(query, n)
 956
 957     def _get_n_results(self, query, n):
 958         """Get a specified number of results for a query"""
 959         raise NotImplementedError("This method must be implemented by subclasses")
 960
 961     @property
 962     def SEARCH_KEY(self):
 963         return self._SEARCH_KEY