_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_HTTPError,
  18     compat_http_client,
  19     compat_urllib_error,
  20     compat_urllib_parse_urlparse,
  21     compat_urlparse,
  22     compat_str,
  23 )
  24 from ..utils import (
  25     age_restricted,
  26     clean_html,
  27     compiled_regex_type,
  28     ExtractorError,
  29     float_or_none,
  30     HEADRequest,
  31     int_or_none,
  32     RegexNotFoundError,
  33     sanitize_filename,
  34     unescapeHTML,
  35 )
  36 _NO_DEFAULT = object()
  37
  38
  39 class InfoExtractor(object):
  40     """Information Extractor class.
  41
  42     Information extractors are the classes that, given a URL, extract
  43     information about the video (or videos) the URL refers to. This
  44     information includes the real video URL, the video title, author and
  45     others. The information is stored in a dictionary which is then
  46     passed to the YoutubeDL. The YoutubeDL processes this
  47     information possibly downloading the video to the file system, among
  48     other possible outcomes.
  49
  50     The type field determines the type of the result.
  51     By far the most common value (and the default if _type is missing) is
  52     "video", which indicates a single video.
  53
  54     For a video, the dictionaries must include the following fields:
  55
  56     id:             Video identifier.
  57     title:          Video title, unescaped.
  58
  59     Additionally, it must contain either a formats entry or a url one:
  60
  61     formats:        A list of dictionaries for each format available, ordered
  62                     from worst to best quality.
  63
  64                     Potential fields:
  65                     * url        Mandatory. The URL of the video file
  66                     * ext        Will be calculated from url if missing
  67                     * format     A human-readable description of the format
  68                                  ("mp4 container with h264/opus").
  69                                  Calculated from the format_id, width, height,
  70                                  and format_note fields if missing.
  71                     * format_id  A short description of the format
  72                                  ("mp4_h264_opus" or "19").
  73                                 Technically optional, but strongly recommended.
  74                     * format_note Additional info about the format
  75                                  ("3D" or "DASH video")
  76                     * width      Width of the video, if known
  77                     * height     Height of the video, if known
  78                     * resolution Textual description of width and height
  79                     * tbr        Average bitrate of audio and video in KBit/s
  80                     * abr        Average audio bitrate in KBit/s
  81                     * acodec     Name of the audio codec in use
  82                     * asr        Audio sampling rate in Hertz
  83                     * vbr        Average video bitrate in KBit/s
  84                     * fps        Frame rate
  85                     * vcodec     Name of the video codec in use
  86                     * container  Name of the container format
  87                     * filesize   The number of bytes, if known in advance
  88                     * filesize_approx  An estimate for the number of bytes
  89                     * player_url SWF Player URL (used for rtmpdump).
  90                     * protocol   The protocol that will be used for the actual
  91                                  download, lower-case.
  92                                  "http", "https", "rtsp", "rtmp", "rtmpe",
  93                                  "m3u8", or "m3u8_native".
  94                     * preference Order number of this format. If this field is
  95                                  present and not None, the formats get sorted
  96                                  by this field, regardless of all other values.
  97                                  -1 for default (order by other properties),
  98                                  -2 or smaller for less than default.
  99                                  < -1000 to hide the format (if there is
 100                                     another one which is strictly better)
 101                     * language_preference  Is this in the correct requested
 102                                  language?
 103                                  10 if it's what the URL is about,
 104                                  -1 for default (don't know),
 105                                  -10 otherwise, other values reserved for now.
 106                     * quality    Order number of the video quality of this
 107                                  format, irrespective of the file format.
 108                                  -1 for default (order by other properties),
 109                                  -2 or smaller for less than default.
 110                     * source_preference  Order number for this video source
 111                                   (quality takes higher priority)
 112                                  -1 for default (order by other properties),
 113                                  -2 or smaller for less than default.
 114                     * http_method  HTTP method to use for the download.
 115                     * http_headers  A dictionary of additional HTTP headers
 116                                  to add to the request.
 117                     * http_post_data  Additional data to send with a POST
 118                                  request.
 119                     * stretched_ratio  If given and not 1, indicates that the
 120                                  video's pixels are not square.
 121                                  width : height ratio as float.
 122                     * no_resume  The server does not support resuming the
 123                                  (HTTP or RTMP) download. Boolean.
 124
 125     url:            Final video URL.
 126     ext:            Video filename extension.
 127     format:         The video format, defaults to ext (used for --get-format)
 128     player_url:     SWF Player URL (used for rtmpdump).
 129
 130     The following fields are optional:
 131
 132     alt_title:      A secondary title of the video.
 133     display_id      An alternative identifier for the video, not necessarily
 134                     unique, but available before title. Typically, id is
 135                     something like "4234987", title "Dancing naked mole rats",
 136                     and display_id "dancing-naked-mole-rats"
 137     thumbnails:     A list of dictionaries, with the following entries:
 138                         * "id" (optional, string) - Thumbnail format ID
 139                         * "url"
 140                         * "preference" (optional, int) - quality of the image
 141                         * "width" (optional, int)
 142                         * "height" (optional, int)
 143                         * "resolution" (optional, string "{width}x{height}",
 144                                         deprecated)
 145     thumbnail:      Full URL to a video thumbnail image.
 146     description:    Full video description.
 147     uploader:       Full name of the video uploader.
 148     creator:        The main artist who created the video.
 149     timestamp:      UNIX timestamp of the moment the video became available.
 150     upload_date:    Video upload date (YYYYMMDD).
 151                     If not explicitly set, calculated from timestamp.
 152     uploader_id:    Nickname or id of the video uploader.
 153     location:       Physical location where the video was filmed.
 154     subtitles:      The subtitle file contents as a dictionary in the format
 155                     {language: subtitles}.
 156     duration:       Length of the video in seconds, as an integer.
 157     view_count:     How many users have watched the video on the platform.
 158     like_count:     Number of positive ratings of the video
 159     dislike_count:  Number of negative ratings of the video
 160     comment_count:  Number of comments on the video
 161     comments:       A list of comments, each with one or more of the following
 162                     properties (all but one of text or html optional):
 163                         * "author" - human-readable name of the comment author
 164                         * "author_id" - user ID of the comment author
 165                         * "id" - Comment ID
 166                         * "html" - Comment as HTML
 167                         * "text" - Plain text of the comment
 168                         * "timestamp" - UNIX timestamp of comment
 169                         * "parent" - ID of the comment this one is replying to.
 170                                      Set to "root" to indicate that this is a
 171                                      comment to the original video.
 172     age_limit:      Age restriction for the video, as an integer (years)
 173     webpage_url:    The url to the video webpage, if given to youtube-dl it
 174                     should allow to get the same result again. (It will be set
 175                     by YoutubeDL if it's missing)
 176     categories:     A list of categories that the video falls in, for example
 177                     ["Sports", "Berlin"]
 178     is_live:        True, False, or None (=unknown). Whether this video is a
 179                     live stream that goes on instead of a fixed-length video.
 180
 181     Unless mentioned otherwise, the fields should be Unicode strings.
 182
 183     Unless mentioned otherwise, None is equivalent to absence of information.
 184
 185
 186     _type "playlist" indicates multiple videos.
 187     There must be a key "entries", which is a list, an iterable, or a PagedList
 188     object, each element of which is a valid dictionary by this specification.
 189
 190     Additionally, playlists can have "title" and "id" attributes with the same
 191     semantics as videos (see above).
 192
 193
 194     _type "multi_video" indicates that there are multiple videos that
 195     form a single show, for example multiple acts of an opera or TV episode.
 196     It must have an entries key like a playlist and contain all the keys
 197     required for a video at the same time.
 198
 199
 200     _type "url" indicates that the video must be extracted from another
 201     location, possibly by a different extractor. Its only required key is:
 202     "url" - the next URL to extract.
 203     The key "ie_key" can be set to the class name (minus the trailing "IE",
 204     e.g. "Youtube") if the extractor class is known in advance.
 205     Additionally, the dictionary may have any properties of the resolved entity
 206     known in advance, for example "title" if the title of the referred video is
 207     known ahead of time.
 208
 209
 210     _type "url_transparent" entities have the same specification as "url", but
 211     indicate that the given additional information is more precise than the one
 212     associated with the resolved URL.
 213     This is useful when a site employs a video service that hosts the video and
 214     its technical metadata, but that video service does not embed a useful
 215     title, description etc.
 216
 217
 218     Subclasses of this one should re-define the _real_initialize() and
 219     _real_extract() methods and define a _VALID_URL regexp.
 220     Probably, they should also be added to the list of extractors.
 221
 222     Finally, the _WORKING attribute should be set to False for broken IEs
 223     in order to warn the users and skip the tests.
 224     """
 225
 226     _ready = False
 227     _downloader = None
 228     _WORKING = True
 229
 230     def __init__(self, downloader=None):
 231         """Constructor. Receives an optional downloader."""
 232         self._ready = False
 233         self.set_downloader(downloader)
 234
 235     @classmethod
 236     def suitable(cls, url):
 237         """Receives a URL and returns True if suitable for this IE."""
 238
 239         # This does not use has/getattr intentionally - we want to know whether
 240         # we have cached the regexp for *this* class, whereas getattr would also
 241         # match the superclass
 242         if '_VALID_URL_RE' not in cls.__dict__:
 243             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 244         return cls._VALID_URL_RE.match(url) is not None
 245
 246     @classmethod
 247     def _match_id(cls, url):
 248         if '_VALID_URL_RE' not in cls.__dict__:
 249             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 250         m = cls._VALID_URL_RE.match(url)
 251         assert m
 252         return m.group('id')
 253
 254     @classmethod
 255     def working(cls):
 256         """Getter method for _WORKING."""
 257         return cls._WORKING
 258
 259     def initialize(self):
 260         """Initializes an instance (authentication, etc)."""
 261         if not self._ready:
 262             self._real_initialize()
 263             self._ready = True
 264
 265     def extract(self, url):
 266         """Extracts URL information and returns it in list of dicts."""
 267         self.initialize()
 268         return self._real_extract(url)
 269
 270     def set_downloader(self, downloader):
 271         """Sets the downloader for this IE."""
 272         self._downloader = downloader
 273
 274     def _real_initialize(self):
 275         """Real initialization process. Redefine in subclasses."""
 276         pass
 277
 278     def _real_extract(self, url):
 279         """Real extraction process. Redefine in subclasses."""
 280         pass
 281
 282     @classmethod
 283     def ie_key(cls):
 284         """A string for getting the InfoExtractor with get_info_extractor"""
 285         return cls.__name__[:-2]
 286
 287     @property
 288     def IE_NAME(self):
 289         return type(self).__name__[:-2]
 290
 291     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 292         """ Returns the response handle """
 293         if note is None:
 294             self.report_download_webpage(video_id)
 295         elif note is not False:
 296             if video_id is None:
 297                 self.to_screen('%s' % (note,))
 298             else:
 299                 self.to_screen('%s: %s' % (video_id, note))
 300         try:
 301             return self._downloader.urlopen(url_or_request)
 302         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 303             if errnote is False:
 304                 return False
 305             if errnote is None:
 306                 errnote = 'Unable to download webpage'
 307             errmsg = '%s: %s' % (errnote, compat_str(err))
 308             if fatal:
 309                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 310             else:
 311                 self._downloader.report_warning(errmsg)
 312                 return False
 313
 314     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 315         """ Returns a tuple (page content as string, URL handle) """
 316         # Strip hashes from the URL (#1038)
 317         if isinstance(url_or_request, (compat_str, str)):
 318             url_or_request = url_or_request.partition('#')[0]
 319
 320         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 321         if urlh is False:
 322             assert not fatal
 323             return False
 324         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 325         return (content, urlh)
 326
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
        """Read the body of urlh and return it decoded to a string.

        urlh:           response handle; its headers and read() are used.
        url_or_request: the URL string or request object the handle came
                        from; only used to label dumps and dump files.
        video_id:       used to build the dump file name.
        prefix:         optional bytes prepended to the body that was read.

        note, errnote and fatal are accepted but not used in this method.

        Depending on the downloader params 'dump_intermediate_pages' and
        'write_pages', the raw bytes are also printed base64-encoded and/or
        written to a '.dump' file.

        Raises ExtractorError (expected=True) if the decoded page looks like
        a Websense block page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        # Encoding detection, in order: charset of the Content-Type header,
        # a <meta ... charset=...> within the first 1024 bytes, the byte
        # sequence FF FE at the start (-> utf-16), and finally utf-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a request object or a plain string
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            # Cap the base name at 240 characters; the cut-off tail is
            # replaced by '___' plus the MD5 hex digest of the full name.
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # Undecodable bytes are replaced; an encoding name Python does not
        # know (LookupError) falls back to utf-8.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect the Websense block page and surface it as an expected error
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return content
 389
 390     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
 391         """ Returns the data of the page as a string """
 392         success = False
 393         try_count = 0
 394         while success is False:
 395             try:
 396                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 397                 success = True
 398             except compat_http_client.IncompleteRead as e:
 399                 try_count += 1
 400                 if try_count >= tries:
 401                     raise e
 402                 self._sleep(timeout, video_id)
 403         if res is False:
 404             return res
 405         else:
 406             content, _ = res
 407             return content
 408
 409     def _download_xml(self, url_or_request, video_id,
 410                       note='Downloading XML', errnote='Unable to download XML',
 411                       transform_source=None, fatal=True):
 412         """Return the xml as an xml.etree.ElementTree.Element"""
 413         xml_string = self._download_webpage(
 414             url_or_request, video_id, note, errnote, fatal=fatal)
 415         if xml_string is False:
 416             return xml_string
 417         if transform_source:
 418             xml_string = transform_source(xml_string)
 419         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 420
 421     def _download_json(self, url_or_request, video_id,
 422                        note='Downloading JSON metadata',
 423                        errnote='Unable to download JSON metadata',
 424                        transform_source=None,
 425                        fatal=True):
 426         json_string = self._download_webpage(
 427             url_or_request, video_id, note, errnote, fatal=fatal)
 428         if (not fatal) and json_string is False:
 429             return None
 430         return self._parse_json(
 431             json_string, video_id, transform_source=transform_source, fatal=fatal)
 432
 433     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 434         if transform_source:
 435             json_string = transform_source(json_string)
 436         try:
 437             return json.loads(json_string)
 438         except ValueError as ve:
 439             errmsg = '%s: Failed to parse JSON ' % video_id
 440             if fatal:
 441                 raise ExtractorError(errmsg, cause=ve)
 442             else:
 443                 self.report_warning(errmsg + str(ve))
 444
 445     def report_warning(self, msg, video_id=None):
 446         idstr = '' if video_id is None else '%s: ' % video_id
 447         self._downloader.report_warning(
 448             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 449
 450     def to_screen(self, msg):
 451         """Print msg to screen, prefixing it with '[ie_name]'"""
 452         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 453
 454     def report_extraction(self, id_or_name):
 455         """Report information extraction."""
 456         self.to_screen('%s: Extracting information' % id_or_name)
 457
 458     def report_download_webpage(self, video_id):
 459         """Report webpage download."""
 460         self.to_screen('%s: Downloading webpage' % video_id)
 461
 462     def report_age_confirmation(self):
 463         """Report attempt to confirm age."""
 464         self.to_screen('Confirming age')
 465
 466     def report_login(self):
 467         """Report attempt to log in."""
 468         self.to_screen('Logging in')
 469
 470     # Methods for following #608
 471     @staticmethod
 472     def url_result(url, ie=None, video_id=None):
 473         """Returns a url that points to a page that should be processed"""
 474         # TODO: ie should be the class used for getting the info
 475         video_info = {'_type': 'url',
 476                       'url': url,
 477                       'ie_key': ie}
 478         if video_id is not None:
 479             video_info['id'] = video_id
 480         return video_info
 481
 482     @staticmethod
 483     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 484         """Returns a playlist"""
 485         video_info = {'_type': 'playlist',
 486                       'entries': entries}
 487         if playlist_id:
 488             video_info['id'] = playlist_id
 489         if playlist_title:
 490             video_info['title'] = playlist_title
 491         if playlist_description:
 492             video_info['description'] = playlist_description
 493         return video_info
 494
 495     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 496         """
 497         Perform a regex search on the given string, using a single or a list of
 498         patterns returning the first matching group.
 499         In case of failure return a default value or raise a WARNING or a
 500         RegexNotFoundError, depending on fatal, specifying the field name.
 501         """
 502         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 503             mobj = re.search(pattern, string, flags)
 504         else:
 505             for p in pattern:
 506                 mobj = re.search(p, string, flags)
 507                 if mobj:
 508                     break
 509
 510         if os.name != 'nt' and sys.stderr.isatty():
 511             _name = '\033[0;34m%s\033[0m' % name
 512         else:
 513             _name = name
 514
 515         if mobj:
 516             if group is None:
 517                 # return the first matching group
 518                 return next(g for g in mobj.groups() if g is not None)
 519             else:
 520                 return mobj.group(group)
 521         elif default is not _NO_DEFAULT:
 522             return default
 523         elif fatal:
 524             raise RegexNotFoundError('Unable to extract %s' % _name)
 525         else:
 526             self._downloader.report_warning('unable to extract %s; '
 527                                             'please report this issue on http://yt-dl.org/bug' % _name)
 528             return None
 529
 530     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 531         """
 532         Like _search_regex, but strips HTML tags and unescapes entities.
 533         """
 534         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 535         if res:
 536             return clean_html(res).strip()
 537         else:
 538             return res
 539
 540     def _get_login_info(self):
 541         """
 542         Get the the login info as (username, password)
 543         It will look in the netrc file using the _NETRC_MACHINE value
 544         If there's no info available, return (None, None)
 545         """
 546         if self._downloader is None:
 547             return (None, None)
 548
 549         username = None
 550         password = None
 551         downloader_params = self._downloader.params
 552
 553         # Attempt to use provided username and password or .netrc data
 554         if downloader_params.get('username', None) is not None:
 555             username = downloader_params['username']
 556             password = downloader_params['password']
 557         elif downloader_params.get('usenetrc', False):
 558             try:
 559                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 560                 if info is not None:
 561                     username = info[0]
 562                     password = info[2]
 563                 else:
 564                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 565             except (IOError, netrc.NetrcParseError) as err:
 566                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 567
 568         return (username, password)
 569
 570     def _get_tfa_info(self):
 571         """
 572         Get the two-factor authentication info
 573         TODO - asking the user will be required for sms/phone verify
 574         currently just uses the command line option
 575         If there's no info available, return None
 576         """
 577         if self._downloader is None:
 578             return None
 579         downloader_params = self._downloader.params
 580
 581         if downloader_params.get('twofactor', None) is not None:
 582             return downloader_params['twofactor']
 583
 584         return None
 585
 586     # Helper functions for extracting OpenGraph info
 587     @staticmethod
 588     def _og_regexes(prop):
 589         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 590         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 591         template = r'<meta[^>]+?%s[^>]+?%s'
 592         return [
 593             template % (property_re, content_re),
 594             template % (content_re, property_re),
 595         ]
 596
 597     def _og_search_property(self, prop, html, name=None, **kargs):
 598         if name is None:
 599             name = 'OpenGraph %s' % prop
 600         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 601         if escaped is None:
 602             return None
 603         return unescapeHTML(escaped)
 604
 605     def _og_search_thumbnail(self, html, **kargs):
 606         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 607
 608     def _og_search_description(self, html, **kargs):
 609         return self._og_search_property('description', html, fatal=False, **kargs)
 610
 611     def _og_search_title(self, html, **kargs):
 612         return self._og_search_property('title', html, **kargs)
 613
 614     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 615         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 616         if secure:
 617             regexes = self._og_regexes('video:secure_url') + regexes
 618         return self._html_search_regex(regexes, html, name, **kargs)
 619
 620     def _og_search_url(self, html, **kargs):
 621         return self._og_search_property('url', html, **kargs)
 622
 623     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 624         if display_name is None:
 625             display_name = name
 626         return self._html_search_regex(
 627             r'''(?isx)<meta
 628                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 629                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
 630             html, display_name, fatal=fatal, group='content', **kwargs)
 631
 632     def _dc_search_uploader(self, html):
 633         return self._html_search_meta('dc.creator', html, 'uploader')
 634
 635     def _rta_search(self, html):
 636         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 637         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 638                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 639                      html):
 640             return 18
 641         return 0
 642
 643     def _media_rating_search(self, html):
 644         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 645         rating = self._html_search_meta('rating', html)
 646
 647         if not rating:
 648             return None
 649
 650         RATING_TABLE = {
 651             'safe for kids': 0,
 652             'general': 8,
 653             '14 years': 14,
 654             'mature': 17,
 655             'restricted': 19,
 656         }
 657         return RATING_TABLE.get(rating.lower(), None)
 658
 659     def _family_friendly_search(self, html):
 660         # See http://schema.org/VideoObj
 661         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 662
 663         if not family_friendly:
 664             return None
 665
 666         RATING_TABLE = {
 667             '1': 0,
 668             'true': 0,
 669             '0': 18,
 670             'false': 18,
 671         }
 672         return RATING_TABLE.get(family_friendly.lower(), None)
 673
 674     def _twitter_search_player(self, html):
 675         return self._html_search_meta('twitter:player', html,
 676                                       'twitter card player')
 677
 678     def _sort_formats(self, formats):
 679         if not formats:
 680             raise ExtractorError('No video formats found')
 681
 682         def _formats_key(f):
 683             # TODO remove the following workaround
 684             from ..utils import determine_ext
 685             if not f.get('ext') and 'url' in f:
 686                 f['ext'] = determine_ext(f['url'])
 687
 688             preference = f.get('preference')
 689             if preference is None:
 690                 proto = f.get('protocol')
 691                 if proto is None:
 692                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 693
 694                 preference = 0 if proto in ['http', 'https'] else -0.1
 695                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 696                     preference -= 0.5
 697
 698             if f.get('vcodec') == 'none':  # audio only
 699                 if self._downloader.params.get('prefer_free_formats'):
 700                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 701                 else:
 702                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 703                 ext_preference = 0
 704                 try:
 705                     audio_ext_preference = ORDER.index(f['ext'])
 706                 except ValueError:
 707                     audio_ext_preference = -1
 708             else:
 709                 if self._downloader.params.get('prefer_free_formats'):
 710                     ORDER = ['flv', 'mp4', 'webm']
 711                 else:
 712                     ORDER = ['webm', 'flv', 'mp4']
 713                 try:
 714                     ext_preference = ORDER.index(f['ext'])
 715                 except ValueError:
 716                     ext_preference = -1
 717                 audio_ext_preference = 0
 718
 719             return (
 720                 preference,
 721                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 722                 f.get('quality') if f.get('quality') is not None else -1,
 723                 f.get('tbr') if f.get('tbr') is not None else -1,
 724                 f.get('vbr') if f.get('vbr') is not None else -1,
 725                 f.get('height') if f.get('height') is not None else -1,
 726                 f.get('width') if f.get('width') is not None else -1,
 727                 ext_preference,
 728                 f.get('abr') if f.get('abr') is not None else -1,
 729                 audio_ext_preference,
 730                 f.get('fps') if f.get('fps') is not None else -1,
 731                 f.get('filesize') if f.get('filesize') is not None else -1,
 732                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 733                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 734                 f.get('format_id'),
 735             )
 736         formats.sort(key=_formats_key)
 737
 738     def _check_formats(self, formats, video_id):
 739         if formats:
 740             formats[:] = filter(
 741                 lambda f: self._is_valid_url(
 742                     f['url'], video_id,
 743                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 744                 formats)
 745
 746     def _is_valid_url(self, url, video_id, item='video'):
 747         try:
 748             self._request_webpage(
 749                 HEADRequest(url), video_id,
 750                 'Checking %s URL' % item)
 751             return True
 752         except ExtractorError as e:
 753             if isinstance(e.cause, compat_HTTPError):
 754                 self.report_warning(
 755                     '%s URL is invalid, skipping' % item, video_id)
 756                 return False
 757             raise
 758
 759     def http_scheme(self):
 760         """ Either "http:" or "https:", depending on the user's preferences """
 761         return (
 762             'http:'
 763             if self._downloader.params.get('prefer_insecure', False)
 764             else 'https:')
 765
 766     def _proto_relative_url(self, url, scheme=None):
 767         if url is None:
 768             return url
 769         if url.startswith('//'):
 770             if scheme is None:
 771                 scheme = self.http_scheme()
 772             return scheme + url
 773         else:
 774             return url
 775
 776     def _sleep(self, timeout, video_id, msg_template=None):
 777         if msg_template is None:
 778             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 779         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 780         self.to_screen(msg)
 781         time.sleep(timeout)
 782
 783     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
 784         manifest = self._download_xml(
 785             manifest_url, video_id, 'Downloading f4m manifest',
 786             'Unable to download f4m manifest')
 787
 788         formats = []
 789         manifest_version = '1.0'
 790         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 791         if not media_nodes:
 792             manifest_version = '2.0'
 793             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 794         for i, media_el in enumerate(media_nodes):
 795             if manifest_version == '2.0':
 796                 manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/'
 797                                 + (media_el.attrib.get('href') or media_el.attrib.get('url')))
 798             tbr = int_or_none(media_el.attrib.get('bitrate'))
 799             formats.append({
 800                 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
 801                 'url': manifest_url,
 802                 'ext': 'flv',
 803                 'tbr': tbr,
 804                 'width': int_or_none(media_el.attrib.get('width')),
 805                 'height': int_or_none(media_el.attrib.get('height')),
 806                 'preference': preference,
 807             })
 808         self._sort_formats(formats)
 809
 810         return formats
 811
 812     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 813                               entry_protocol='m3u8', preference=None,
 814                               m3u8_id=None):
 815
 816         formats = [{
 817             'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
 818             'url': m3u8_url,
 819             'ext': ext,
 820             'protocol': 'm3u8',
 821             'preference': -1,
 822             'resolution': 'multiple',
 823             'format_note': 'Quality selection URL',
 824         }]
 825
 826         format_url = lambda u: (
 827             u
 828             if re.match(r'^https?://', u)
 829             else compat_urlparse.urljoin(m3u8_url, u))
 830
 831         m3u8_doc = self._download_webpage(
 832             m3u8_url, video_id,
 833             note='Downloading m3u8 information',
 834             errnote='Failed to download m3u8 information')
 835         last_info = None
 836         kv_rex = re.compile(
 837             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 838         for line in m3u8_doc.splitlines():
 839             if line.startswith('#EXT-X-STREAM-INF:'):
 840                 last_info = {}
 841                 for m in kv_rex.finditer(line):
 842                     v = m.group('val')
 843                     if v.startswith('"'):
 844                         v = v[1:-1]
 845                     last_info[m.group('key')] = v
 846             elif line.startswith('#') or not line.strip():
 847                 continue
 848             else:
 849                 if last_info is None:
 850                     formats.append({'url': format_url(line)})
 851                     continue
 852                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 853                 f = {
 854                     'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
 855                     'url': format_url(line.strip()),
 856                     'tbr': tbr,
 857                     'ext': ext,
 858                     'protocol': entry_protocol,
 859                     'preference': preference,
 860                 }
 861                 codecs = last_info.get('CODECS')
 862                 if codecs:
 863                     # TODO: looks like video codec is not always necessarily goes first
 864                     va_codecs = codecs.split(',')
 865                     if va_codecs[0]:
 866                         f['vcodec'] = va_codecs[0].partition('.')[0]
 867                     if len(va_codecs) > 1 and va_codecs[1]:
 868                         f['acodec'] = va_codecs[1].partition('.')[0]
 869                 resolution = last_info.get('RESOLUTION')
 870                 if resolution:
 871                     width_str, height_str = resolution.split('x')
 872                     f['width'] = int(width_str)
 873                     f['height'] = int(height_str)
 874                 formats.append(f)
 875                 last_info = {}
 876         self._sort_formats(formats)
 877         return formats
 878
 879     # TODO: improve extraction
 880     def _extract_smil_formats(self, smil_url, video_id, fatal=True):
 881         smil = self._download_xml(
 882             smil_url, video_id, 'Downloading SMIL file',
 883             'Unable to download SMIL file', fatal=fatal)
 884         if smil is False:
 885             assert not fatal
 886             return []
 887
 888         base = smil.find('./head/meta').get('base')
 889
 890         formats = []
 891         rtmp_count = 0
 892         for video in smil.findall('./body/switch/video'):
 893             src = video.get('src')
 894             if not src:
 895                 continue
 896             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
 897             width = int_or_none(video.get('width'))
 898             height = int_or_none(video.get('height'))
 899             proto = video.get('proto')
 900             if not proto:
 901                 if base:
 902                     if base.startswith('rtmp'):
 903                         proto = 'rtmp'
 904                     elif base.startswith('http'):
 905                         proto = 'http'
 906             ext = video.get('ext')
 907             if proto == 'm3u8':
 908                 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
 909             elif proto == 'rtmp':
 910                 rtmp_count += 1
 911                 streamer = video.get('streamer') or base
 912                 formats.append({
 913                     'url': streamer,
 914                     'play_path': src,
 915                     'ext': 'flv',
 916                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
 917                     'tbr': bitrate,
 918                     'width': width,
 919                     'height': height,
 920                 })
 921         self._sort_formats(formats)
 922
 923         return formats
 924
 925     def _live_title(self, name):
 926         """ Generate the title for a live video """
 927         now = datetime.datetime.now()
 928         now_str = now.strftime("%Y-%m-%d %H:%M")
 929         return name + ' ' + now_str
 930
 931     def _int(self, v, name, fatal=False, **kwargs):
 932         res = int_or_none(v, **kwargs)
 933         if 'get_attr' in kwargs:
 934             print(getattr(v, kwargs['get_attr']))
 935         if res is None:
 936             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 937             if fatal:
 938                 raise ExtractorError(msg)
 939             else:
 940                 self._downloader.report_warning(msg)
 941         return res
 942
 943     def _float(self, v, name, fatal=False, **kwargs):
 944         res = float_or_none(v, **kwargs)
 945         if res is None:
 946             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 947             if fatal:
 948                 raise ExtractorError(msg)
 949             else:
 950                 self._downloader.report_warning(msg)
 951         return res
 952
 953     def _set_cookie(self, domain, name, value, expire_time=None):
 954         cookie = compat_cookiejar.Cookie(
 955             0, name, value, None, None, domain, None,
 956             None, '/', True, False, expire_time, '', None, None, None)
 957         self._downloader.cookiejar.set_cookie(cookie)
 958
 959     def get_testcases(self, include_onlymatching=False):
 960         t = getattr(self, '_TEST', None)
 961         if t:
 962             assert not hasattr(self, '_TESTS'), \
 963                 '%s has _TEST and _TESTS' % type(self).__name__
 964             tests = [t]
 965         else:
 966             tests = getattr(self, '_TESTS', [])
 967         for t in tests:
 968             if not include_onlymatching and t.get('only_matching', False):
 969                 continue
 970             t['name'] = type(self).__name__[:-len('IE')]
 971             yield t
 972
 973     def is_suitable(self, age_limit):
 974         """ Test whether the extractor is generally suitable for the given
 975         age limit (i.e. pornographic sites are not, all others usually are) """
 976
 977         any_restricted = False
 978         for tc in self.get_testcases(include_onlymatching=False):
 979             if 'playlist' in tc:
 980                 tc = tc['playlist'][0]
 981             is_restricted = age_restricted(
 982                 tc.get('info_dict', {}).get('age_limit'), age_limit)
 983             if not is_restricted:
 984                 return True
 985             any_restricted = any_restricted or is_restricted
 986         return not any_restricted
 987
 988
 989 class SearchInfoExtractor(InfoExtractor):
 990     """
 991     Base class for paged search queries extractors.
 992     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 993     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 994     """
 995
 996     @classmethod
 997     def _make_valid_url(cls):
 998         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 999
1000     @classmethod
1001     def suitable(cls, url):
1002         return re.match(cls._make_valid_url(), url) is not None
1003
1004     def _real_extract(self, query):
1005         mobj = re.match(self._make_valid_url(), query)
1006         if mobj is None:
1007             raise ExtractorError('Invalid search query "%s"' % query)
1008
1009         prefix = mobj.group('prefix')
1010         query = mobj.group('query')
1011         if prefix == '':
1012             return self._get_n_results(query, 1)
1013         elif prefix == 'all':
1014             return self._get_n_results(query, self._MAX_RESULTS)
1015         else:
1016             n = int(prefix)
1017             if n <= 0:
1018                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1019             elif n > self._MAX_RESULTS:
1020                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1021                 n = self._MAX_RESULTS
1022             return self._get_n_results(query, n)
1023
1024     def _get_n_results(self, query, n):
1025         """Get a specified number of results for a query"""
1026         raise NotImplementedError("This method must be implemented by subclasses")
1027
1028     @property
1029     def SEARCH_KEY(self):
1030         return self._SEARCH_KEY