_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_HTTPError,
  20     compat_http_client,
  21     compat_urllib_error,
  22     compat_urllib_parse,
  23     compat_urllib_parse_urlparse,
  24     compat_urllib_request,
  25     compat_urlparse,
  26     compat_str,
  27 )
  28 from ..utils import (
  29     NO_DEFAULT,
  30     age_restricted,
  31     bug_reports_message,
  32     clean_html,
  33     compiled_regex_type,
  34     determine_ext,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     unescapeHTML,
  42     unified_strdate,
  43     url_basename,
  44     xpath_text,
  45     xpath_with_ns,
  46 )
  47
  48
  49 class InfoExtractor(object):
  50     """Information Extractor class.
  51
  52     Information extractors are the classes that, given a URL, extract
  53     information about the video (or videos) the URL refers to. This
  54     information includes the real video URL, the video title, author and
  55     others. The information is stored in a dictionary which is then
  56     passed to the YoutubeDL. The YoutubeDL processes this
  57     information possibly downloading the video to the file system, among
  58     other possible outcomes.
  59
  60     The type field determines the type of the result.
  61     By far the most common value (and the default if _type is missing) is
  62     "video", which indicates a single video.
  63
  64     For a video, the dictionaries must include the following fields:
  65
  66     id:             Video identifier.
  67     title:          Video title, unescaped.
  68
  69     Additionally, it must contain either a formats entry or a url one:
  70
  71     formats:        A list of dictionaries for each format available, ordered
  72                     from worst to best quality.
  73
  74                     Potential fields:
  75                     * url        Mandatory. The URL of the video file
  76                     * ext        Will be calculated from URL if missing
  77                     * format     A human-readable description of the format
  78                                  ("mp4 container with h264/opus").
   79                                  Calculated from the format_id, width, height,
  80                                  and format_note fields if missing.
  81                     * format_id  A short description of the format
  82                                  ("mp4_h264_opus" or "19").
  83                                 Technically optional, but strongly recommended.
  84                     * format_note Additional info about the format
  85                                  ("3D" or "DASH video")
  86                     * width      Width of the video, if known
  87                     * height     Height of the video, if known
  88                     * resolution Textual description of width and height
  89                     * tbr        Average bitrate of audio and video in KBit/s
  90                     * abr        Average audio bitrate in KBit/s
  91                     * acodec     Name of the audio codec in use
  92                     * asr        Audio sampling rate in Hertz
  93                     * vbr        Average video bitrate in KBit/s
  94                     * fps        Frame rate
  95                     * vcodec     Name of the video codec in use
  96                     * container  Name of the container format
  97                     * filesize   The number of bytes, if known in advance
  98                     * filesize_approx  An estimate for the number of bytes
  99                     * player_url SWF Player URL (used for rtmpdump).
 100                     * protocol   The protocol that will be used for the actual
 101                                  download, lower-case.
 102                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 103                                  "m3u8", or "m3u8_native".
 104                     * preference Order number of this format. If this field is
 105                                  present and not None, the formats get sorted
 106                                  by this field, regardless of all other values.
 107                                  -1 for default (order by other properties),
 108                                  -2 or smaller for less than default.
 109                                  < -1000 to hide the format (if there is
 110                                     another one which is strictly better)
 111                     * language_preference  Is this in the correct requested
 112                                  language?
 113                                  10 if it's what the URL is about,
 114                                  -1 for default (don't know),
 115                                  -10 otherwise, other values reserved for now.
 116                     * quality    Order number of the video quality of this
 117                                  format, irrespective of the file format.
 118                                  -1 for default (order by other properties),
 119                                  -2 or smaller for less than default.
 120                     * source_preference  Order number for this video source
 121                                   (quality takes higher priority)
 122                                  -1 for default (order by other properties),
 123                                  -2 or smaller for less than default.
 124                     * http_headers  A dictionary of additional HTTP headers
 125                                  to add to the request.
 126                     * stretched_ratio  If given and not 1, indicates that the
 127                                  video's pixels are not square.
 128                                  width : height ratio as float.
 129                     * no_resume  The server does not support resuming the
 130                                  (HTTP or RTMP) download. Boolean.
 131
 132     url:            Final video URL.
 133     ext:            Video filename extension.
 134     format:         The video format, defaults to ext (used for --get-format)
 135     player_url:     SWF Player URL (used for rtmpdump).
 136
 137     The following fields are optional:
 138
 139     alt_title:      A secondary title of the video.
 140     display_id      An alternative identifier for the video, not necessarily
 141                     unique, but available before title. Typically, id is
 142                     something like "4234987", title "Dancing naked mole rats",
 143                     and display_id "dancing-naked-mole-rats"
 144     thumbnails:     A list of dictionaries, with the following entries:
 145                         * "id" (optional, string) - Thumbnail format ID
 146                         * "url"
 147                         * "preference" (optional, int) - quality of the image
 148                         * "width" (optional, int)
 149                         * "height" (optional, int)
  150                         * "resolution" (optional, string "{width}x{height}",
 151                                         deprecated)
 152     thumbnail:      Full URL to a video thumbnail image.
 153     description:    Full video description.
 154     uploader:       Full name of the video uploader.
 155     creator:        The main artist who created the video.
 156     release_date:   The date (YYYYMMDD) when the video was released.
 157     timestamp:      UNIX timestamp of the moment the video became available.
 158     upload_date:    Video upload date (YYYYMMDD).
 159                     If not explicitly set, calculated from timestamp.
 160     uploader_id:    Nickname or id of the video uploader.
 161     location:       Physical location where the video was filmed.
 162     subtitles:      The available subtitles as a dictionary in the format
 163                     {language: subformats}. "subformats" is a list sorted from
 164                     lower to higher preference, each element is a dictionary
 165                     with the "ext" entry and one of:
 166                         * "data": The subtitles file contents
 167                         * "url": A URL pointing to the subtitles file
 168     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 169                     automatically generated captions
 170     duration:       Length of the video in seconds, as an integer.
 171     view_count:     How many users have watched the video on the platform.
 172     like_count:     Number of positive ratings of the video
 173     dislike_count:  Number of negative ratings of the video
  174     average_rating: Average rating given by users, the scale used depends on the webpage
 175     comment_count:  Number of comments on the video
 176     comments:       A list of comments, each with one or more of the following
 177                     properties (all but one of text or html optional):
 178                         * "author" - human-readable name of the comment author
 179                         * "author_id" - user ID of the comment author
 180                         * "id" - Comment ID
 181                         * "html" - Comment as HTML
 182                         * "text" - Plain text of the comment
 183                         * "timestamp" - UNIX timestamp of comment
 184                         * "parent" - ID of the comment this one is replying to.
 185                                      Set to "root" to indicate that this is a
 186                                      comment to the original video.
 187     age_limit:      Age restriction for the video, as an integer (years)
 188     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 189                     should allow to get the same result again. (It will be set
 190                     by YoutubeDL if it's missing)
 191     categories:     A list of categories that the video falls in, for example
 192                     ["Sports", "Berlin"]
 193     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 194     is_live:        True, False, or None (=unknown). Whether this video is a
 195                     live stream that goes on instead of a fixed-length video.
 196     start_time:     Time in seconds where the reproduction should start, as
 197                     specified in the URL.
 198     end_time:       Time in seconds where the reproduction should end, as
 199                     specified in the URL.
 200
 201     Unless mentioned otherwise, the fields should be Unicode strings.
 202
 203     Unless mentioned otherwise, None is equivalent to absence of information.
 204
 205
 206     _type "playlist" indicates multiple videos.
 207     There must be a key "entries", which is a list, an iterable, or a PagedList
 208     object, each element of which is a valid dictionary by this specification.
 209
 210     Additionally, playlists can have "title", "description" and "id" attributes
 211     with the same semantics as videos (see above).
 212
 213
 214     _type "multi_video" indicates that there are multiple videos that
  215     form a single show, for example multiple acts of an opera or TV episode.
 216     It must have an entries key like a playlist and contain all the keys
 217     required for a video at the same time.
 218
 219
 220     _type "url" indicates that the video must be extracted from another
 221     location, possibly by a different extractor. Its only required key is:
 222     "url" - the next URL to extract.
 223     The key "ie_key" can be set to the class name (minus the trailing "IE",
 224     e.g. "Youtube") if the extractor class is known in advance.
 225     Additionally, the dictionary may have any properties of the resolved entity
 226     known in advance, for example "title" if the title of the referred video is
 227     known ahead of time.
 228
 229
 230     _type "url_transparent" entities have the same specification as "url", but
 231     indicate that the given additional information is more precise than the one
 232     associated with the resolved URL.
 233     This is useful when a site employs a video service that hosts the video and
 234     its technical metadata, but that video service does not embed a useful
 235     title, description etc.
 236
 237
 238     Subclasses of this one should re-define the _real_initialize() and
 239     _real_extract() methods and define a _VALID_URL regexp.
 240     Probably, they should also be added to the list of extractors.
 241
 242     Finally, the _WORKING attribute should be set to False for broken IEs
 243     in order to warn the users and skip the tests.
 244     """
 245
    # Becomes True once initialize() has run _real_initialize().
    _ready = False
    # Downloader object; assigned through set_downloader().
    _downloader = None
    # Value returned by working(); set to False for broken IEs.
    _WORKING = True
 249
 250     def __init__(self, downloader=None):
 251         """Constructor. Receives an optional downloader."""
 252         self._ready = False
 253         self.set_downloader(downloader)
 254
 255     @classmethod
 256     def suitable(cls, url):
 257         """Receives a URL and returns True if suitable for this IE."""
 258
 259         # This does not use has/getattr intentionally - we want to know whether
 260         # we have cached the regexp for *this* class, whereas getattr would also
 261         # match the superclass
 262         if '_VALID_URL_RE' not in cls.__dict__:
 263             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 264         return cls._VALID_URL_RE.match(url) is not None
 265
 266     @classmethod
 267     def _match_id(cls, url):
 268         if '_VALID_URL_RE' not in cls.__dict__:
 269             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 270         m = cls._VALID_URL_RE.match(url)
 271         assert m
 272         return m.group('id')
 273
 274     @classmethod
 275     def working(cls):
 276         """Getter method for _WORKING."""
 277         return cls._WORKING
 278
 279     def initialize(self):
 280         """Initializes an instance (authentication, etc)."""
 281         if not self._ready:
 282             self._real_initialize()
 283             self._ready = True
 284
 285     def extract(self, url):
 286         """Extracts URL information and returns it in list of dicts."""
 287         try:
 288             self.initialize()
 289             return self._real_extract(url)
 290         except ExtractorError:
 291             raise
 292         except compat_http_client.IncompleteRead as e:
 293             raise ExtractorError('A network error has occured.', cause=e, expected=True)
 294         except (KeyError, StopIteration) as e:
 295             raise ExtractorError('An extractor error has occured.', cause=e)
 296
 297     def set_downloader(self, downloader):
 298         """Sets the downloader for this IE."""
 299         self._downloader = downloader
 300
 301     def _real_initialize(self):
 302         """Real initialization process. Redefine in subclasses."""
 303         pass
 304
 305     def _real_extract(self, url):
 306         """Real extraction process. Redefine in subclasses."""
 307         pass
 308
 309     @classmethod
 310     def ie_key(cls):
 311         """A string for getting the InfoExtractor with get_info_extractor"""
 312         return cls.__name__[:-2]
 313
 314     @property
 315     def IE_NAME(self):
 316         return type(self).__name__[:-2]
 317
 318     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 319         """ Returns the response handle """
 320         if note is None:
 321             self.report_download_webpage(video_id)
 322         elif note is not False:
 323             if video_id is None:
 324                 self.to_screen('%s' % (note,))
 325             else:
 326                 self.to_screen('%s: %s' % (video_id, note))
 327         try:
 328             return self._downloader.urlopen(url_or_request)
 329         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 330             if errnote is False:
 331                 return False
 332             if errnote is None:
 333                 errnote = 'Unable to download webpage'
 334             errmsg = '%s: %s' % (errnote, compat_str(err))
 335             if fatal:
 336                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 337             else:
 338                 self._downloader.report_warning(errmsg)
 339                 return False
 340
 341     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 342         """ Returns a tuple (page content as string, URL handle) """
 343         # Strip hashes from the URL (#1038)
 344         if isinstance(url_or_request, (compat_str, str)):
 345             url_or_request = url_or_request.partition('#')[0]
 346
 347         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 348         if urlh is False:
 349             assert not fatal
 350             return False
 351         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 352         return (content, urlh)
 353
 354     @staticmethod
 355     def _guess_encoding_from_content(content_type, webpage_bytes):
 356         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 357         if m:
 358             encoding = m.group(1)
 359         else:
 360             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 361                           webpage_bytes[:1024])
 362             if m:
 363                 encoding = m.group(1).decode('ascii')
 364             elif webpage_bytes.startswith(b'\xff\xfe'):
 365                 encoding = 'utf-16'
 366             else:
 367                 encoding = 'utf-8'
 368
 369         return encoding
 370
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of the response urlh and return it decoded as text.

        url_or_request is the URL string or request object the response was
        opened from; here it is only used to label dumps.  If prefix is not
        None it is prepended to the bytes read.  When encoding is not given
        it is guessed with _guess_encoding_from_content().  note, errnote
        and fatal are accepted but not used by this method.

        Depending on the downloader params the raw bytes are additionally
        printed base64-encoded ('dump_intermediate_pages') or written to a
        '.dump' file ('write_pages').

        Raises an expected ExtractorError when the page looks like a
        Websense or Indian censorship block page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a request object or a plain URL string
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            # Cap the base name at 240 characters; the cut-off tail is
            # replaced by an MD5 digest of the full name.
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An encoding name unknown to Python raises LookupError; fall back
        # to UTF-8 in that case.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect well-known block pages and report them as expected errors
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
 433
 434     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 435         """ Returns the data of the page as a string """
 436         success = False
 437         try_count = 0
 438         while success is False:
 439             try:
 440                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 441                 success = True
 442             except compat_http_client.IncompleteRead as e:
 443                 try_count += 1
 444                 if try_count >= tries:
 445                     raise e
 446                 self._sleep(timeout, video_id)
 447         if res is False:
 448             return res
 449         else:
 450             content, _ = res
 451             return content
 452
 453     def _download_xml(self, url_or_request, video_id,
 454                       note='Downloading XML', errnote='Unable to download XML',
 455                       transform_source=None, fatal=True, encoding=None):
 456         """Return the xml as an xml.etree.ElementTree.Element"""
 457         xml_string = self._download_webpage(
 458             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 459         if xml_string is False:
 460             return xml_string
 461         if transform_source:
 462             xml_string = transform_source(xml_string)
 463         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 464
 465     def _download_json(self, url_or_request, video_id,
 466                        note='Downloading JSON metadata',
 467                        errnote='Unable to download JSON metadata',
 468                        transform_source=None,
 469                        fatal=True, encoding=None):
 470         json_string = self._download_webpage(
 471             url_or_request, video_id, note, errnote, fatal=fatal,
 472             encoding=encoding)
 473         if (not fatal) and json_string is False:
 474             return None
 475         return self._parse_json(
 476             json_string, video_id, transform_source=transform_source, fatal=fatal)
 477
 478     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 479         if transform_source:
 480             json_string = transform_source(json_string)
 481         try:
 482             return json.loads(json_string)
 483         except ValueError as ve:
 484             errmsg = '%s: Failed to parse JSON ' % video_id
 485             if fatal:
 486                 raise ExtractorError(errmsg, cause=ve)
 487             else:
 488                 self.report_warning(errmsg + str(ve))
 489
 490     def report_warning(self, msg, video_id=None):
 491         idstr = '' if video_id is None else '%s: ' % video_id
 492         self._downloader.report_warning(
 493             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 494
 495     def to_screen(self, msg):
 496         """Print msg to screen, prefixing it with '[ie_name]'"""
 497         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 498
 499     def report_extraction(self, id_or_name):
 500         """Report information extraction."""
 501         self.to_screen('%s: Extracting information' % id_or_name)
 502
 503     def report_download_webpage(self, video_id):
 504         """Report webpage download."""
 505         self.to_screen('%s: Downloading webpage' % video_id)
 506
 507     def report_age_confirmation(self):
 508         """Report attempt to confirm age."""
 509         self.to_screen('Confirming age')
 510
 511     def report_login(self):
 512         """Report attempt to log in."""
 513         self.to_screen('Logging in')
 514
 515     @staticmethod
 516     def raise_login_required(msg='This video is only available for registered users'):
 517         raise ExtractorError(
 518             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 519             expected=True)
 520
 521     @staticmethod
 522     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 523         raise ExtractorError(
 524             '%s. You might want to use --proxy to workaround.' % msg,
 525             expected=True)
 526
 527     # Methods for following #608
 528     @staticmethod
 529     def url_result(url, ie=None, video_id=None, video_title=None):
 530         """Returns a URL that points to a page that should be processed"""
 531         # TODO: ie should be the class used for getting the info
 532         video_info = {'_type': 'url',
 533                       'url': url,
 534                       'ie_key': ie}
 535         if video_id is not None:
 536             video_info['id'] = video_id
 537         if video_title is not None:
 538             video_info['title'] = video_title
 539         return video_info
 540
 541     @staticmethod
 542     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 543         """Returns a playlist"""
 544         video_info = {'_type': 'playlist',
 545                       'entries': entries}
 546         if playlist_id:
 547             video_info['id'] = playlist_id
 548         if playlist_title:
 549             video_info['title'] = playlist_title
 550         if playlist_description:
 551             video_info['description'] = playlist_description
 552         return video_info
 553
 554     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 555         """
 556         Perform a regex search on the given string, using a single or a list of
 557         patterns returning the first matching group.
 558         In case of failure return a default value or raise a WARNING or a
 559         RegexNotFoundError, depending on fatal, specifying the field name.
 560         """
 561         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 562             mobj = re.search(pattern, string, flags)
 563         else:
 564             for p in pattern:
 565                 mobj = re.search(p, string, flags)
 566                 if mobj:
 567                     break
 568
 569         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 570             _name = '\033[0;34m%s\033[0m' % name
 571         else:
 572             _name = name
 573
 574         if mobj:
 575             if group is None:
 576                 # return the first matching group
 577                 return next(g for g in mobj.groups() if g is not None)
 578             else:
 579                 return mobj.group(group)
 580         elif default is not NO_DEFAULT:
 581             return default
 582         elif fatal:
 583             raise RegexNotFoundError('Unable to extract %s' % _name)
 584         else:
 585             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 586             return None
 587
 588     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 589         """
 590         Like _search_regex, but strips HTML tags and unescapes entities.
 591         """
 592         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 593         if res:
 594             return clean_html(res).strip()
 595         else:
 596             return res
 597
 598     def _get_login_info(self):
 599         """
 600         Get the login info as (username, password)
 601         It will look in the netrc file using the _NETRC_MACHINE value
 602         If there's no info available, return (None, None)
 603         """
 604         if self._downloader is None:
 605             return (None, None)
 606
 607         username = None
 608         password = None
 609         downloader_params = self._downloader.params
 610
 611         # Attempt to use provided username and password or .netrc data
 612         if downloader_params.get('username', None) is not None:
 613             username = downloader_params['username']
 614             password = downloader_params['password']
 615         elif downloader_params.get('usenetrc', False):
 616             try:
 617                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 618                 if info is not None:
 619                     username = info[0]
 620                     password = info[2]
 621                 else:
 622                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 623             except (IOError, netrc.NetrcParseError) as err:
 624                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 625
 626         return (username, password)
 627
 628     def _get_tfa_info(self, note='two-factor verification code'):
 629         """
 630         Get the two-factor authentication info
 631         TODO - asking the user will be required for sms/phone verify
 632         currently just uses the command line option
 633         If there's no info available, return None
 634         """
 635         if self._downloader is None:
 636             return None
 637         downloader_params = self._downloader.params
 638
 639         if downloader_params.get('twofactor', None) is not None:
 640             return downloader_params['twofactor']
 641
 642         return compat_getpass('Type %s and press [Return]: ' % note)
 643
 644     # Helper functions for extracting OpenGraph info
 645     @staticmethod
 646     def _og_regexes(prop):
 647         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 648         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 649         template = r'<meta[^>]+?%s[^>]+?%s'
 650         return [
 651             template % (property_re, content_re),
 652             template % (content_re, property_re),
 653         ]
 654
 655     @staticmethod
 656     def _meta_regex(prop):
 657         return r'''(?isx)<meta
 658                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 659                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 660
 661     def _og_search_property(self, prop, html, name=None, **kargs):
 662         if name is None:
 663             name = 'OpenGraph %s' % prop
 664         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 665         if escaped is None:
 666             return None
 667         return unescapeHTML(escaped)
 668
 669     def _og_search_thumbnail(self, html, **kargs):
 670         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 671
 672     def _og_search_description(self, html, **kargs):
 673         return self._og_search_property('description', html, fatal=False, **kargs)
 674
 675     def _og_search_title(self, html, **kargs):
 676         return self._og_search_property('title', html, **kargs)
 677
 678     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 679         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 680         if secure:
 681             regexes = self._og_regexes('video:secure_url') + regexes
 682         return self._html_search_regex(regexes, html, name, **kargs)
 683
 684     def _og_search_url(self, html, **kargs):
 685         return self._og_search_property('url', html, **kargs)
 686
 687     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 688         if display_name is None:
 689             display_name = name
 690         return self._html_search_regex(
 691             self._meta_regex(name),
 692             html, display_name, fatal=fatal, group='content', **kwargs)
 693
 694     def _dc_search_uploader(self, html):
 695         return self._html_search_meta('dc.creator', html, 'uploader')
 696
 697     def _rta_search(self, html):
 698         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 699         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 700                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 701                      html):
 702             return 18
 703         return 0
 704
 705     def _media_rating_search(self, html):
 706         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 707         rating = self._html_search_meta('rating', html)
 708
 709         if not rating:
 710             return None
 711
 712         RATING_TABLE = {
 713             'safe for kids': 0,
 714             'general': 8,
 715             '14 years': 14,
 716             'mature': 17,
 717             'restricted': 19,
 718         }
 719         return RATING_TABLE.get(rating.lower(), None)
 720
 721     def _family_friendly_search(self, html):
 722         # See http://schema.org/VideoObject
 723         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 724
 725         if not family_friendly:
 726             return None
 727
 728         RATING_TABLE = {
 729             '1': 0,
 730             'true': 0,
 731             '0': 18,
 732             'false': 18,
 733         }
 734         return RATING_TABLE.get(family_friendly.lower(), None)
 735
 736     def _twitter_search_player(self, html):
 737         return self._html_search_meta('twitter:player', html,
 738                                       'twitter card player')
 739
 740     @staticmethod
 741     def _hidden_inputs(html):
 742         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 743         hidden_inputs = {}
 744         for input in re.findall(r'(?i)<input([^>]+)>', html):
 745             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 746                 continue
 747             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 748             if not name:
 749                 continue
 750             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 751             if not value:
 752                 continue
 753             hidden_inputs[name.group('value')] = value.group('value')
 754         return hidden_inputs
 755
 756     def _form_hidden_inputs(self, form_id, html):
 757         form = self._search_regex(
 758             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 759             html, '%s form' % form_id, group='form')
 760         return self._hidden_inputs(form)
 761
 762     def _sort_formats(self, formats, field_preference=None):
 763         if not formats:
 764             raise ExtractorError('No video formats found')
 765
 766         def _formats_key(f):
 767             # TODO remove the following workaround
 768             from ..utils import determine_ext
 769             if not f.get('ext') and 'url' in f:
 770                 f['ext'] = determine_ext(f['url'])
 771
 772             if isinstance(field_preference, (list, tuple)):
 773                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 774
 775             preference = f.get('preference')
 776             if preference is None:
 777                 proto = f.get('protocol')
 778                 if proto is None:
 779                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 780
 781                 preference = 0 if proto in ['http', 'https'] else -0.1
 782                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 783                     preference -= 0.5
 784
 785             if f.get('vcodec') == 'none':  # audio only
 786                 if self._downloader.params.get('prefer_free_formats'):
 787                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 788                 else:
 789                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 790                 ext_preference = 0
 791                 try:
 792                     audio_ext_preference = ORDER.index(f['ext'])
 793                 except ValueError:
 794                     audio_ext_preference = -1
 795             else:
 796                 if self._downloader.params.get('prefer_free_formats'):
 797                     ORDER = ['flv', 'mp4', 'webm']
 798                 else:
 799                     ORDER = ['webm', 'flv', 'mp4']
 800                 try:
 801                     ext_preference = ORDER.index(f['ext'])
 802                 except ValueError:
 803                     ext_preference = -1
 804                 audio_ext_preference = 0
 805
 806             return (
 807                 preference,
 808                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 809                 f.get('quality') if f.get('quality') is not None else -1,
 810                 f.get('tbr') if f.get('tbr') is not None else -1,
 811                 f.get('filesize') if f.get('filesize') is not None else -1,
 812                 f.get('vbr') if f.get('vbr') is not None else -1,
 813                 f.get('height') if f.get('height') is not None else -1,
 814                 f.get('width') if f.get('width') is not None else -1,
 815                 ext_preference,
 816                 f.get('abr') if f.get('abr') is not None else -1,
 817                 audio_ext_preference,
 818                 f.get('fps') if f.get('fps') is not None else -1,
 819                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 820                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 821                 f.get('format_id') if f.get('format_id') is not None else '',
 822             )
 823         formats.sort(key=_formats_key)
 824
 825     def _check_formats(self, formats, video_id):
 826         if formats:
 827             formats[:] = filter(
 828                 lambda f: self._is_valid_url(
 829                     f['url'], video_id,
 830                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 831                 formats)
 832
 833     def _is_valid_url(self, url, video_id, item='video'):
 834         url = self._proto_relative_url(url, scheme='http:')
 835         # For now assume non HTTP(S) URLs always valid
 836         if not (url.startswith('http://') or url.startswith('https://')):
 837             return True
 838         try:
 839             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 840             return True
 841         except ExtractorError as e:
 842             if isinstance(e.cause, compat_HTTPError):
 843                 self.to_screen(
 844                     '%s: %s URL is invalid, skipping' % (video_id, item))
 845                 return False
 846             raise
 847
 848     def http_scheme(self):
 849         """ Either "http:" or "https:", depending on the user's preferences """
 850         return (
 851             'http:'
 852             if self._downloader.params.get('prefer_insecure', False)
 853             else 'https:')
 854
 855     def _proto_relative_url(self, url, scheme=None):
 856         if url is None:
 857             return url
 858         if url.startswith('//'):
 859             if scheme is None:
 860                 scheme = self.http_scheme()
 861             return scheme + url
 862         else:
 863             return url
 864
 865     def _sleep(self, timeout, video_id, msg_template=None):
 866         if msg_template is None:
 867             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 868         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 869         self.to_screen(msg)
 870         time.sleep(timeout)
 871
 872     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 873                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 874                              fatal=True):
 875         manifest = self._download_xml(
 876             manifest_url, video_id, 'Downloading f4m manifest',
 877             'Unable to download f4m manifest',
 878             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 879             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 880             transform_source=transform_source,
 881             fatal=fatal)
 882
 883         if manifest is False:
 884             return manifest
 885
 886         formats = []
 887         manifest_version = '1.0'
 888         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 889         if not media_nodes:
 890             manifest_version = '2.0'
 891             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 892         for i, media_el in enumerate(media_nodes):
 893             if manifest_version == '2.0':
 894                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 895                 if not media_url:
 896                     continue
 897                 manifest_url = (
 898                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 899                     else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
 900                 # If media_url is itself a f4m manifest do the recursive extraction
 901                 # since bitrates in parent manifest (this one) and media_url manifest
 902                 # may differ leading to inability to resolve the format by requested
 903                 # bitrate in f4m downloader
 904                 if determine_ext(manifest_url) == 'f4m':
 905                     f4m_formats = self._extract_f4m_formats(
 906                         manifest_url, video_id, preference, f4m_id, fatal=fatal)
 907                     if f4m_formats:
 908                         formats.extend(f4m_formats)
 909                     continue
 910             tbr = int_or_none(media_el.attrib.get('bitrate'))
 911             formats.append({
 912                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 913                 'url': manifest_url,
 914                 'ext': 'flv',
 915                 'tbr': tbr,
 916                 'width': int_or_none(media_el.attrib.get('width')),
 917                 'height': int_or_none(media_el.attrib.get('height')),
 918                 'preference': preference,
 919             })
 920         self._sort_formats(formats)
 921
 922         return formats
 923
 924     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 925                               entry_protocol='m3u8', preference=None,
 926                               m3u8_id=None, note=None, errnote=None,
 927                               fatal=True):
 928
 929         formats = [{
 930             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
 931             'url': m3u8_url,
 932             'ext': ext,
 933             'protocol': 'm3u8',
 934             'preference': preference - 1 if preference else -1,
 935             'resolution': 'multiple',
 936             'format_note': 'Quality selection URL',
 937         }]
 938
 939         format_url = lambda u: (
 940             u
 941             if re.match(r'^https?://', u)
 942             else compat_urlparse.urljoin(m3u8_url, u))
 943
 944         m3u8_doc = self._download_webpage(
 945             m3u8_url, video_id,
 946             note=note or 'Downloading m3u8 information',
 947             errnote=errnote or 'Failed to download m3u8 information',
 948             fatal=fatal)
 949         if m3u8_doc is False:
 950             return m3u8_doc
 951         last_info = None
 952         last_media = None
 953         kv_rex = re.compile(
 954             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 955         for line in m3u8_doc.splitlines():
 956             if line.startswith('#EXT-X-STREAM-INF:'):
 957                 last_info = {}
 958                 for m in kv_rex.finditer(line):
 959                     v = m.group('val')
 960                     if v.startswith('"'):
 961                         v = v[1:-1]
 962                     last_info[m.group('key')] = v
 963             elif line.startswith('#EXT-X-MEDIA:'):
 964                 last_media = {}
 965                 for m in kv_rex.finditer(line):
 966                     v = m.group('val')
 967                     if v.startswith('"'):
 968                         v = v[1:-1]
 969                     last_media[m.group('key')] = v
 970             elif line.startswith('#') or not line.strip():
 971                 continue
 972             else:
 973                 if last_info is None:
 974                     formats.append({'url': format_url(line)})
 975                     continue
 976                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 977                 format_id = []
 978                 if m3u8_id:
 979                     format_id.append(m3u8_id)
 980                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
 981                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
 982                 f = {
 983                     'format_id': '-'.join(format_id),
 984                     'url': format_url(line.strip()),
 985                     'tbr': tbr,
 986                     'ext': ext,
 987                     'protocol': entry_protocol,
 988                     'preference': preference,
 989                 }
 990                 codecs = last_info.get('CODECS')
 991                 if codecs:
 992                     # TODO: looks like video codec is not always necessarily goes first
 993                     va_codecs = codecs.split(',')
 994                     if va_codecs[0]:
 995                         f['vcodec'] = va_codecs[0].partition('.')[0]
 996                     if len(va_codecs) > 1 and va_codecs[1]:
 997                         f['acodec'] = va_codecs[1].partition('.')[0]
 998                 resolution = last_info.get('RESOLUTION')
 999                 if resolution:
1000                     width_str, height_str = resolution.split('x')
1001                     f['width'] = int(width_str)
1002                     f['height'] = int(height_str)
1003                 if last_media is not None:
1004                     f['m3u8_media'] = last_media
1005                     last_media = None
1006                 formats.append(f)
1007                 last_info = {}
1008         self._sort_formats(formats)
1009         return formats
1010
1011     @staticmethod
1012     def _xpath_ns(path, namespace=None):
1013         if not namespace:
1014             return path
1015         out = []
1016         for c in path.split('/'):
1017             if not c or c == '.':
1018                 out.append(c)
1019             else:
1020                 out.append('{%s}%s' % (namespace, c))
1021         return '/'.join(out)
1022
1023     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1024         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1025
1026         if smil is False:
1027             assert not fatal
1028             return []
1029
1030         namespace = self._parse_smil_namespace(smil)
1031
1032         return self._parse_smil_formats(
1033             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1034
1035     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1036         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1037         if smil is False:
1038             return {}
1039         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1040
1041     def _download_smil(self, smil_url, video_id, fatal=True):
1042         return self._download_xml(
1043             smil_url, video_id, 'Downloading SMIL file',
1044             'Unable to download SMIL file', fatal=fatal)
1045
1046     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1047         namespace = self._parse_smil_namespace(smil)
1048
1049         formats = self._parse_smil_formats(
1050             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1051         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1052
1053         video_id = os.path.splitext(url_basename(smil_url))[0]
1054         title = None
1055         description = None
1056         upload_date = None
1057         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1058             name = meta.attrib.get('name')
1059             content = meta.attrib.get('content')
1060             if not name or not content:
1061                 continue
1062             if not title and name == 'title':
1063                 title = content
1064             elif not description and name in ('description', 'abstract'):
1065                 description = content
1066             elif not upload_date and name == 'date':
1067                 upload_date = unified_strdate(content)
1068
1069         thumbnails = [{
1070             'id': image.get('type'),
1071             'url': image.get('src'),
1072             'width': int_or_none(image.get('width')),
1073             'height': int_or_none(image.get('height')),
1074         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1075
1076         return {
1077             'id': video_id,
1078             'title': title or video_id,
1079             'description': description,
1080             'upload_date': upload_date,
1081             'thumbnails': thumbnails,
1082             'formats': formats,
1083             'subtitles': subtitles,
1084         }
1085
1086     def _parse_smil_namespace(self, smil):
1087         return self._search_regex(
1088             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1089
1090     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1091         base = smil_url
1092         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1093             b = meta.get('base') or meta.get('httpBase')
1094             if b:
1095                 base = b
1096                 break
1097
1098         formats = []
1099         rtmp_count = 0
1100         http_count = 0
1101
1102         videos = smil.findall(self._xpath_ns('.//video', namespace))
1103         for video in videos:
1104             src = video.get('src')
1105             if not src:
1106                 continue
1107
1108             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1109             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1110             width = int_or_none(video.get('width'))
1111             height = int_or_none(video.get('height'))
1112             proto = video.get('proto')
1113             ext = video.get('ext')
1114             src_ext = determine_ext(src)
1115             streamer = video.get('streamer') or base
1116
1117             if proto == 'rtmp' or streamer.startswith('rtmp'):
1118                 rtmp_count += 1
1119                 formats.append({
1120                     'url': streamer,
1121                     'play_path': src,
1122                     'ext': 'flv',
1123                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1124                     'tbr': bitrate,
1125                     'filesize': filesize,
1126                     'width': width,
1127                     'height': height,
1128                 })
1129                 if transform_rtmp_url:
1130                     streamer, src = transform_rtmp_url(streamer, src)
1131                     formats[-1].update({
1132                         'url': streamer,
1133                         'play_path': src,
1134                     })
1135                 continue
1136
1137             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1138
1139             if proto == 'm3u8' or src_ext == 'm3u8':
1140                 m3u8_formats = self._extract_m3u8_formats(
1141                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1142                 if m3u8_formats:
1143                     formats.extend(m3u8_formats)
1144                 continue
1145
1146             if src_ext == 'f4m':
1147                 f4m_url = src_url
1148                 if not f4m_params:
1149                     f4m_params = {
1150                         'hdcore': '3.2.0',
1151                         'plugin': 'flowplayer-3.2.0.1',
1152                     }
1153                 f4m_url += '&' if '?' in f4m_url else '?'
1154                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1155                 f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
1156                 if f4m_formats:
1157                     formats.extend(f4m_formats)
1158                 continue
1159
1160             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1161                 http_count += 1
1162                 formats.append({
1163                     'url': src_url,
1164                     'ext': ext or src_ext or 'flv',
1165                     'format_id': 'http-%d' % (bitrate or http_count),
1166                     'tbr': bitrate,
1167                     'filesize': filesize,
1168                     'width': width,
1169                     'height': height,
1170                 })
1171                 continue
1172
1173         self._sort_formats(formats)
1174
1175         return formats
1176
1177     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1178         subtitles = {}
1179         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1180             src = textstream.get('src')
1181             if not src:
1182                 continue
1183             ext = textstream.get('ext') or determine_ext(src)
1184             if not ext:
1185                 type_ = textstream.get('type')
1186                 SUBTITLES_TYPES = {
1187                     'text/vtt': 'vtt',
1188                     'text/srt': 'srt',
1189                     'application/smptett+xml': 'tt',
1190                 }
1191                 if type_ in SUBTITLES_TYPES:
1192                     ext = SUBTITLES_TYPES[type_]
1193             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1194             subtitles.setdefault(lang, []).append({
1195                 'url': src,
1196                 'ext': ext,
1197             })
1198         return subtitles
1199
1200     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1201         xspf = self._download_xml(
1202             playlist_url, playlist_id, 'Downloading xpsf playlist',
1203             'Unable to download xspf manifest', fatal=fatal)
1204         if xspf is False:
1205             return []
1206         return self._parse_xspf(xspf, playlist_id)
1207
1208     def _parse_xspf(self, playlist, playlist_id):
1209         NS_MAP = {
1210             'xspf': 'http://xspf.org/ns/0/',
1211             's1': 'http://static.streamone.nl/player/ns/0',
1212         }
1213
1214         entries = []
1215         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1216             title = xpath_text(
1217                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1218             description = xpath_text(
1219                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1220             thumbnail = xpath_text(
1221                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1222             duration = float_or_none(
1223                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1224
1225             formats = [{
1226                 'url': location.text,
1227                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1228                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1229                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1230             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1231             self._sort_formats(formats)
1232
1233             entries.append({
1234                 'id': playlist_id,
1235                 'title': title,
1236                 'description': description,
1237                 'thumbnail': thumbnail,
1238                 'duration': duration,
1239                 'formats': formats,
1240             })
1241         return entries
1242
1243     def _live_title(self, name):
1244         """ Generate the title for a live video """
1245         now = datetime.datetime.now()
1246         now_str = now.strftime("%Y-%m-%d %H:%M")
1247         return name + ' ' + now_str
1248
1249     def _int(self, v, name, fatal=False, **kwargs):
1250         res = int_or_none(v, **kwargs)
1251         if 'get_attr' in kwargs:
1252             print(getattr(v, kwargs['get_attr']))
1253         if res is None:
1254             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1255             if fatal:
1256                 raise ExtractorError(msg)
1257             else:
1258                 self._downloader.report_warning(msg)
1259         return res
1260
1261     def _float(self, v, name, fatal=False, **kwargs):
1262         res = float_or_none(v, **kwargs)
1263         if res is None:
1264             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1265             if fatal:
1266                 raise ExtractorError(msg)
1267             else:
1268                 self._downloader.report_warning(msg)
1269         return res
1270
1271     def _set_cookie(self, domain, name, value, expire_time=None):
1272         cookie = compat_cookiejar.Cookie(
1273             0, name, value, None, None, domain, None,
1274             None, '/', True, False, expire_time, '', None, None, None)
1275         self._downloader.cookiejar.set_cookie(cookie)
1276
1277     def _get_cookies(self, url):
1278         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1279         req = compat_urllib_request.Request(url)
1280         self._downloader.cookiejar.add_cookie_header(req)
1281         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1282
1283     def get_testcases(self, include_onlymatching=False):
1284         t = getattr(self, '_TEST', None)
1285         if t:
1286             assert not hasattr(self, '_TESTS'), \
1287                 '%s has _TEST and _TESTS' % type(self).__name__
1288             tests = [t]
1289         else:
1290             tests = getattr(self, '_TESTS', [])
1291         for t in tests:
1292             if not include_onlymatching and t.get('only_matching', False):
1293                 continue
1294             t['name'] = type(self).__name__[:-len('IE')]
1295             yield t
1296
1297     def is_suitable(self, age_limit):
1298         """ Test whether the extractor is generally suitable for the given
1299         age limit (i.e. pornographic sites are not, all others usually are) """
1300
1301         any_restricted = False
1302         for tc in self.get_testcases(include_onlymatching=False):
1303             if 'playlist' in tc:
1304                 tc = tc['playlist'][0]
1305             is_restricted = age_restricted(
1306                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1307             if not is_restricted:
1308                 return True
1309             any_restricted = any_restricted or is_restricted
1310         return not any_restricted
1311
1312     def extract_subtitles(self, *args, **kwargs):
1313         if (self._downloader.params.get('writesubtitles', False) or
1314                 self._downloader.params.get('listsubtitles')):
1315             return self._get_subtitles(*args, **kwargs)
1316         return {}
1317
1318     def _get_subtitles(self, *args, **kwargs):
1319         raise NotImplementedError("This method must be implemented by subclasses")
1320
1321     @staticmethod
1322     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1323         """ Merge subtitle items for one language. Items with duplicated URLs
1324         will be dropped. """
1325         list1_urls = set([item['url'] for item in subtitle_list1])
1326         ret = list(subtitle_list1)
1327         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1328         return ret
1329
1330     @classmethod
1331     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1332         """ Merge two subtitle dictionaries, language by language. """
1333         ret = dict(subtitle_dict1)
1334         for lang in subtitle_dict2:
1335             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1336         return ret
1337
1338     def extract_automatic_captions(self, *args, **kwargs):
1339         if (self._downloader.params.get('writeautomaticsub', False) or
1340                 self._downloader.params.get('listsubtitles')):
1341             return self._get_automatic_captions(*args, **kwargs)
1342         return {}
1343
1344     def _get_automatic_captions(self, *args, **kwargs):
1345         raise NotImplementedError("This method must be implemented by subclasses")
1346
1347
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regex matching this extractor's search queries."""
        query_tpl = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return query_tpl % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether `url` is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """
        Parse `query` and fetch the requested number of results.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS
        and a number requests that many, capped at _MAX_RESULTS with a
        warning.  Raises ExtractorError for a query that does not match.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key prefix (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY