git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13
  14 from ..compat import (
  15     compat_cookiejar,
  16     compat_cookies,
  17     compat_getpass,
  18     compat_http_client,
  19     compat_urllib_error,
  20     compat_urllib_parse,
  21     compat_urlparse,
  22     compat_str,
  23     compat_etree_fromstring,
  24 )
  25 from ..utils import (
  26     NO_DEFAULT,
  27     age_restricted,
  28     bug_reports_message,
  29     clean_html,
  30     compiled_regex_type,
  31     determine_ext,
  32     error_to_compat_str,
  33     ExtractorError,
  34     fix_xml_ampersands,
  35     float_or_none,
  36     int_or_none,
  37     RegexNotFoundError,
  38     sanitize_filename,
  39     sanitized_Request,
  40     unescapeHTML,
  41     unified_strdate,
  42     url_basename,
  43     xpath_text,
  44     xpath_with_ns,
  45     determine_protocol,
  46 )
  47
  48
  49 class InfoExtractor(object):
  50     """Information Extractor class.
  51
  52     Information extractors are the classes that, given a URL, extract
  53     information about the video (or videos) the URL refers to. This
  54     information includes the real video URL, the video title, author and
  55     others. The information is stored in a dictionary which is then
  56     passed to the YoutubeDL. The YoutubeDL processes this
  57     information possibly downloading the video to the file system, among
  58     other possible outcomes.
  59
  60     The type field determines the type of the result.
  61     By far the most common value (and the default if _type is missing) is
  62     "video", which indicates a single video.
  63
  64     For a video, the dictionaries must include the following fields:
  65
  66     id:             Video identifier.
  67     title:          Video title, unescaped.
  68
  69     Additionally, it must contain either a formats entry or a url one:
  70
  71     formats:        A list of dictionaries for each format available, ordered
  72                     from worst to best quality.
  73
  74                     Potential fields:
  75                     * url        Mandatory. The URL of the video file
  76                     * ext        Will be calculated from URL if missing
  77                     * format     A human-readable description of the format
  78                                  ("mp4 container with h264/opus").
  79                                  Calculated from the format_id, width, height.
  80                                  and format_note fields if missing.
  81                     * format_id  A short description of the format
  82                                  ("mp4_h264_opus" or "19").
  83                                 Technically optional, but strongly recommended.
  84                     * format_note Additional info about the format
  85                                  ("3D" or "DASH video")
  86                     * width      Width of the video, if known
  87                     * height     Height of the video, if known
  88                     * resolution Textual description of width and height
  89                     * tbr        Average bitrate of audio and video in KBit/s
  90                     * abr        Average audio bitrate in KBit/s
  91                     * acodec     Name of the audio codec in use
  92                     * asr        Audio sampling rate in Hertz
  93                     * vbr        Average video bitrate in KBit/s
  94                     * fps        Frame rate
  95                     * vcodec     Name of the video codec in use
  96                     * container  Name of the container format
  97                     * filesize   The number of bytes, if known in advance
  98                     * filesize_approx  An estimate for the number of bytes
  99                     * player_url SWF Player URL (used for rtmpdump).
 100                     * protocol   The protocol that will be used for the actual
 101                                  download, lower-case.
 102                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 103                                  "m3u8", or "m3u8_native".
 104                     * preference Order number of this format. If this field is
 105                                  present and not None, the formats get sorted
 106                                  by this field, regardless of all other values.
 107                                  -1 for default (order by other properties),
 108                                  -2 or smaller for less than default.
 109                                  < -1000 to hide the format (if there is
 110                                     another one which is strictly better)
 111                     * language   Language code, e.g. "de" or "en-US".
 112                     * language_preference  Is this in the language mentioned in
 113                                  the URL?
 114                                  10 if it's what the URL is about,
 115                                  -1 for default (don't know),
 116                                  -10 otherwise, other values reserved for now.
 117                     * quality    Order number of the video quality of this
 118                                  format, irrespective of the file format.
 119                                  -1 for default (order by other properties),
 120                                  -2 or smaller for less than default.
 121                     * source_preference  Order number for this video source
 122                                   (quality takes higher priority)
 123                                  -1 for default (order by other properties),
 124                                  -2 or smaller for less than default.
 125                     * http_headers  A dictionary of additional HTTP headers
 126                                  to add to the request.
 127                     * stretched_ratio  If given and not 1, indicates that the
 128                                  video's pixels are not square.
 129                                  width : height ratio as float.
 130                     * no_resume  The server does not support resuming the
 131                                  (HTTP or RTMP) download. Boolean.
 132
 133     url:            Final video URL.
 134     ext:            Video filename extension.
 135     format:         The video format, defaults to ext (used for --get-format)
 136     player_url:     SWF Player URL (used for rtmpdump).
 137
 138     The following fields are optional:
 139
 140     alt_title:      A secondary title of the video.
 141     display_id      An alternative identifier for the video, not necessarily
 142                     unique, but available before title. Typically, id is
 143                     something like "4234987", title "Dancing naked mole rats",
 144                     and display_id "dancing-naked-mole-rats"
 145     thumbnails:     A list of dictionaries, with the following entries:
 146                         * "id" (optional, string) - Thumbnail format ID
 147                         * "url"
 148                         * "preference" (optional, int) - quality of the image
 149                         * "width" (optional, int)
 150                         * "height" (optional, int)
 151                         * "resolution" (optional, string "{width}x{height"},
 152                                         deprecated)
 153     thumbnail:      Full URL to a video thumbnail image.
 154     description:    Full video description.
 155     uploader:       Full name of the video uploader.
 156     creator:        The main artist who created the video.
 157     release_date:   The date (YYYYMMDD) when the video was released.
 158     timestamp:      UNIX timestamp of the moment the video became available.
 159     upload_date:    Video upload date (YYYYMMDD).
 160                     If not explicitly set, calculated from timestamp.
 161     uploader_id:    Nickname or id of the video uploader.
 162     location:       Physical location where the video was filmed.
 163     subtitles:      The available subtitles as a dictionary in the format
 164                     {language: subformats}. "subformats" is a list sorted from
 165                     lower to higher preference, each element is a dictionary
 166                     with the "ext" entry and one of:
 167                         * "data": The subtitles file contents
 168                         * "url": A URL pointing to the subtitles file
 169                     "ext" will be calculated from URL if missing
 170     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 171                     automatically generated captions
 172     duration:       Length of the video in seconds, as an integer or float.
 173     view_count:     How many users have watched the video on the platform.
 174     like_count:     Number of positive ratings of the video
 175     dislike_count:  Number of negative ratings of the video
 176     repost_count:   Number of reposts of the video
 177     average_rating: Average rating give by users, the scale used depends on the webpage
 178     comment_count:  Number of comments on the video
 179     comments:       A list of comments, each with one or more of the following
 180                     properties (all but one of text or html optional):
 181                         * "author" - human-readable name of the comment author
 182                         * "author_id" - user ID of the comment author
 183                         * "id" - Comment ID
 184                         * "html" - Comment as HTML
 185                         * "text" - Plain text of the comment
 186                         * "timestamp" - UNIX timestamp of comment
 187                         * "parent" - ID of the comment this one is replying to.
 188                                      Set to "root" to indicate that this is a
 189                                      comment to the original video.
 190     age_limit:      Age restriction for the video, as an integer (years)
 191     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 192                     should allow to get the same result again. (It will be set
 193                     by YoutubeDL if it's missing)
 194     categories:     A list of categories that the video falls in, for example
 195                     ["Sports", "Berlin"]
 196     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 197     is_live:        True, False, or None (=unknown). Whether this video is a
 198                     live stream that goes on instead of a fixed-length video.
 199     start_time:     Time in seconds where the reproduction should start, as
 200                     specified in the URL.
 201     end_time:       Time in seconds where the reproduction should end, as
 202                     specified in the URL.
 203
 204     Unless mentioned otherwise, the fields should be Unicode strings.
 205
 206     Unless mentioned otherwise, None is equivalent to absence of information.
 207
 208
 209     _type "playlist" indicates multiple videos.
 210     There must be a key "entries", which is a list, an iterable, or a PagedList
 211     object, each element of which is a valid dictionary by this specification.
 212
 213     Additionally, playlists can have "title", "description" and "id" attributes
 214     with the same semantics as videos (see above).
 215
 216
 217     _type "multi_video" indicates that there are multiple videos that
 218     form a single show, for examples multiple acts of an opera or TV episode.
 219     It must have an entries key like a playlist and contain all the keys
 220     required for a video at the same time.
 221
 222
 223     _type "url" indicates that the video must be extracted from another
 224     location, possibly by a different extractor. Its only required key is:
 225     "url" - the next URL to extract.
 226     The key "ie_key" can be set to the class name (minus the trailing "IE",
 227     e.g. "Youtube") if the extractor class is known in advance.
 228     Additionally, the dictionary may have any properties of the resolved entity
 229     known in advance, for example "title" if the title of the referred video is
 230     known ahead of time.
 231
 232
 233     _type "url_transparent" entities have the same specification as "url", but
 234     indicate that the given additional information is more precise than the one
 235     associated with the resolved URL.
 236     This is useful when a site employs a video service that hosts the video and
 237     its technical metadata, but that video service does not embed a useful
 238     title, description etc.
 239
 240
 241     Subclasses of this one should re-define the _real_initialize() and
 242     _real_extract() methods and define a _VALID_URL regexp.
 243     Probably, they should also be added to the list of extractors.
 244
 245     Finally, the _WORKING attribute should be set to False for broken IEs
 246     in order to warn the users and skip the tests.
 247     """
 248
 249     _ready = False
 250     _downloader = None
 251     _WORKING = True
 252
 253     def __init__(self, downloader=None):
 254         """Constructor. Receives an optional downloader."""
 255         self._ready = False
 256         self.set_downloader(downloader)
 257
 258     @classmethod
 259     def suitable(cls, url):
 260         """Receives a URL and returns True if suitable for this IE."""
 261
 262         # This does not use has/getattr intentionally - we want to know whether
 263         # we have cached the regexp for *this* class, whereas getattr would also
 264         # match the superclass
 265         if '_VALID_URL_RE' not in cls.__dict__:
 266             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 267         return cls._VALID_URL_RE.match(url) is not None
 268
 269     @classmethod
 270     def _match_id(cls, url):
 271         if '_VALID_URL_RE' not in cls.__dict__:
 272             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 273         m = cls._VALID_URL_RE.match(url)
 274         assert m
 275         return m.group('id')
 276
 277     @classmethod
 278     def working(cls):
 279         """Getter method for _WORKING."""
 280         return cls._WORKING
 281
 282     def initialize(self):
 283         """Initializes an instance (authentication, etc)."""
 284         if not self._ready:
 285             self._real_initialize()
 286             self._ready = True
 287
 288     def extract(self, url):
 289         """Extracts URL information and returns it in list of dicts."""
 290         try:
 291             self.initialize()
 292             return self._real_extract(url)
 293         except ExtractorError:
 294             raise
 295         except compat_http_client.IncompleteRead as e:
 296             raise ExtractorError('A network error has occured.', cause=e, expected=True)
 297         except (KeyError, StopIteration) as e:
 298             raise ExtractorError('An extractor error has occured.', cause=e)
 299
 300     def set_downloader(self, downloader):
 301         """Sets the downloader for this IE."""
 302         self._downloader = downloader
 303
 304     def _real_initialize(self):
 305         """Real initialization process. Redefine in subclasses."""
 306         pass
 307
 308     def _real_extract(self, url):
 309         """Real extraction process. Redefine in subclasses."""
 310         pass
 311
 312     @classmethod
 313     def ie_key(cls):
 314         """A string for getting the InfoExtractor with get_info_extractor"""
 315         return compat_str(cls.__name__[:-2])
 316
 317     @property
 318     def IE_NAME(self):
 319         return compat_str(type(self).__name__[:-2])
 320
 321     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 322         """ Returns the response handle """
 323         if note is None:
 324             self.report_download_webpage(video_id)
 325         elif note is not False:
 326             if video_id is None:
 327                 self.to_screen('%s' % (note,))
 328             else:
 329                 self.to_screen('%s: %s' % (video_id, note))
 330         try:
 331             return self._downloader.urlopen(url_or_request)
 332         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 333             if errnote is False:
 334                 return False
 335             if errnote is None:
 336                 errnote = 'Unable to download webpage'
 337
 338             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 339             if fatal:
 340                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 341             else:
 342                 self._downloader.report_warning(errmsg)
 343                 return False
 344
 345     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 346         """ Returns a tuple (page content as string, URL handle) """
 347         # Strip hashes from the URL (#1038)
 348         if isinstance(url_or_request, (compat_str, str)):
 349             url_or_request = url_or_request.partition('#')[0]
 350
 351         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 352         if urlh is False:
 353             assert not fatal
 354             return False
 355         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 356         return (content, urlh)
 357
 358     @staticmethod
 359     def _guess_encoding_from_content(content_type, webpage_bytes):
 360         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 361         if m:
 362             encoding = m.group(1)
 363         else:
 364             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 365                           webpage_bytes[:1024])
 366             if m:
 367                 encoding = m.group(1).decode('ascii')
 368             elif webpage_bytes.startswith(b'\xff\xfe'):
 369                 encoding = 'utf-16'
 370             else:
 371                 encoding = 'utf-8'
 372
 373         return encoding
 374
 375     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 376         content_type = urlh.headers.get('Content-Type', '')
 377         webpage_bytes = urlh.read()
 378         if prefix is not None:
 379             webpage_bytes = prefix + webpage_bytes
 380         if not encoding:
 381             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 382         if self._downloader.params.get('dump_intermediate_pages', False):
 383             try:
 384                 url = url_or_request.get_full_url()
 385             except AttributeError:
 386                 url = url_or_request
 387             self.to_screen('Dumping request to ' + url)
 388             dump = base64.b64encode(webpage_bytes).decode('ascii')
 389             self._downloader.to_screen(dump)
 390         if self._downloader.params.get('write_pages', False):
 391             try:
 392                 url = url_or_request.get_full_url()
 393             except AttributeError:
 394                 url = url_or_request
 395             basen = '%s_%s' % (video_id, url)
 396             if len(basen) > 240:
 397                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 398                 basen = basen[:240 - len(h)] + h
 399             raw_filename = basen + '.dump'
 400             filename = sanitize_filename(raw_filename, restricted=True)
 401             self.to_screen('Saving request to ' + filename)
 402             # Working around MAX_PATH limitation on Windows (see
 403             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 404             if os.name == 'nt':
 405                 absfilepath = os.path.abspath(filename)
 406                 if len(absfilepath) > 259:
 407                     filename = '\\\\?\\' + absfilepath
 408             with open(filename, 'wb') as outf:
 409                 outf.write(webpage_bytes)
 410
 411         try:
 412             content = webpage_bytes.decode(encoding, 'replace')
 413         except LookupError:
 414             content = webpage_bytes.decode('utf-8', 'replace')
 415
 416         if ('<title>Access to this site is blocked</title>' in content and
 417                 'Websense' in content[:512]):
 418             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 419             blocked_iframe = self._html_search_regex(
 420                 r'<iframe src="([^"]+)"', content,
 421                 'Websense information URL', default=None)
 422             if blocked_iframe:
 423                 msg += ' Visit %s for more details' % blocked_iframe
 424             raise ExtractorError(msg, expected=True)
 425         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 426             msg = (
 427                 'Access to this webpage has been blocked by Indian censorship. '
 428                 'Use a VPN or proxy server (with --proxy) to route around it.')
 429             block_msg = self._html_search_regex(
 430                 r'</h1><p>(.*?)</p>',
 431                 content, 'block message', default=None)
 432             if block_msg:
 433                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 434             raise ExtractorError(msg, expected=True)
 435
 436         return content
 437
 438     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 439         """ Returns the data of the page as a string """
 440         success = False
 441         try_count = 0
 442         while success is False:
 443             try:
 444                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 445                 success = True
 446             except compat_http_client.IncompleteRead as e:
 447                 try_count += 1
 448                 if try_count >= tries:
 449                     raise e
 450                 self._sleep(timeout, video_id)
 451         if res is False:
 452             return res
 453         else:
 454             content, _ = res
 455             return content
 456
 457     def _download_xml(self, url_or_request, video_id,
 458                       note='Downloading XML', errnote='Unable to download XML',
 459                       transform_source=None, fatal=True, encoding=None):
 460         """Return the xml as an xml.etree.ElementTree.Element"""
 461         xml_string = self._download_webpage(
 462             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 463         if xml_string is False:
 464             return xml_string
 465         if transform_source:
 466             xml_string = transform_source(xml_string)
 467         return compat_etree_fromstring(xml_string.encode('utf-8'))
 468
 469     def _download_json(self, url_or_request, video_id,
 470                        note='Downloading JSON metadata',
 471                        errnote='Unable to download JSON metadata',
 472                        transform_source=None,
 473                        fatal=True, encoding=None):
 474         json_string = self._download_webpage(
 475             url_or_request, video_id, note, errnote, fatal=fatal,
 476             encoding=encoding)
 477         if (not fatal) and json_string is False:
 478             return None
 479         return self._parse_json(
 480             json_string, video_id, transform_source=transform_source, fatal=fatal)
 481
 482     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 483         if transform_source:
 484             json_string = transform_source(json_string)
 485         try:
 486             return json.loads(json_string)
 487         except ValueError as ve:
 488             errmsg = '%s: Failed to parse JSON ' % video_id
 489             if fatal:
 490                 raise ExtractorError(errmsg, cause=ve)
 491             else:
 492                 self.report_warning(errmsg + str(ve))
 493
 494     def report_warning(self, msg, video_id=None):
 495         idstr = '' if video_id is None else '%s: ' % video_id
 496         self._downloader.report_warning(
 497             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 498
 499     def to_screen(self, msg):
 500         """Print msg to screen, prefixing it with '[ie_name]'"""
 501         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 502
 503     def report_extraction(self, id_or_name):
 504         """Report information extraction."""
 505         self.to_screen('%s: Extracting information' % id_or_name)
 506
 507     def report_download_webpage(self, video_id):
 508         """Report webpage download."""
 509         self.to_screen('%s: Downloading webpage' % video_id)
 510
 511     def report_age_confirmation(self):
 512         """Report attempt to confirm age."""
 513         self.to_screen('Confirming age')
 514
 515     def report_login(self):
 516         """Report attempt to log in."""
 517         self.to_screen('Logging in')
 518
 519     @staticmethod
 520     def raise_login_required(msg='This video is only available for registered users'):
 521         raise ExtractorError(
 522             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 523             expected=True)
 524
 525     @staticmethod
 526     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 527         raise ExtractorError(
 528             '%s. You might want to use --proxy to workaround.' % msg,
 529             expected=True)
 530
 531     # Methods for following #608
 532     @staticmethod
 533     def url_result(url, ie=None, video_id=None, video_title=None):
 534         """Returns a URL that points to a page that should be processed"""
 535         # TODO: ie should be the class used for getting the info
 536         video_info = {'_type': 'url',
 537                       'url': url,
 538                       'ie_key': ie}
 539         if video_id is not None:
 540             video_info['id'] = video_id
 541         if video_title is not None:
 542             video_info['title'] = video_title
 543         return video_info
 544
 545     @staticmethod
 546     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 547         """Returns a playlist"""
 548         video_info = {'_type': 'playlist',
 549                       'entries': entries}
 550         if playlist_id:
 551             video_info['id'] = playlist_id
 552         if playlist_title:
 553             video_info['title'] = playlist_title
 554         if playlist_description:
 555             video_info['description'] = playlist_description
 556         return video_info
 557
 558     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 559         """
 560         Perform a regex search on the given string, using a single or a list of
 561         patterns returning the first matching group.
 562         In case of failure return a default value or raise a WARNING or a
 563         RegexNotFoundError, depending on fatal, specifying the field name.
 564         """
 565         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 566             mobj = re.search(pattern, string, flags)
 567         else:
 568             for p in pattern:
 569                 mobj = re.search(p, string, flags)
 570                 if mobj:
 571                     break
 572
 573         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 574             _name = '\033[0;34m%s\033[0m' % name
 575         else:
 576             _name = name
 577
 578         if mobj:
 579             if group is None:
 580                 # return the first matching group
 581                 return next(g for g in mobj.groups() if g is not None)
 582             else:
 583                 return mobj.group(group)
 584         elif default is not NO_DEFAULT:
 585             return default
 586         elif fatal:
 587             raise RegexNotFoundError('Unable to extract %s' % _name)
 588         else:
 589             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 590             return None
 591
 592     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 593         """
 594         Like _search_regex, but strips HTML tags and unescapes entities.
 595         """
 596         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 597         if res:
 598             return clean_html(res).strip()
 599         else:
 600             return res
 601
 602     def _get_login_info(self):
 603         """
 604         Get the login info as (username, password)
 605         It will look in the netrc file using the _NETRC_MACHINE value
 606         If there's no info available, return (None, None)
 607         """
 608         if self._downloader is None:
 609             return (None, None)
 610
 611         username = None
 612         password = None
 613         downloader_params = self._downloader.params
 614
 615         # Attempt to use provided username and password or .netrc data
 616         if downloader_params.get('username', None) is not None:
 617             username = downloader_params['username']
 618             password = downloader_params['password']
 619         elif downloader_params.get('usenetrc', False):
 620             try:
 621                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 622                 if info is not None:
 623                     username = info[0]
 624                     password = info[2]
 625                 else:
 626                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 627             except (IOError, netrc.NetrcParseError) as err:
 628                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 629
 630         return (username, password)
 631
 632     def _get_tfa_info(self, note='two-factor verification code'):
 633         """
 634         Get the two-factor authentication info
 635         TODO - asking the user will be required for sms/phone verify
 636         currently just uses the command line option
 637         If there's no info available, return None
 638         """
 639         if self._downloader is None:
 640             return None
 641         downloader_params = self._downloader.params
 642
 643         if downloader_params.get('twofactor', None) is not None:
 644             return downloader_params['twofactor']
 645
 646         return compat_getpass('Type %s and press [Return]: ' % note)
 647
 648     # Helper functions for extracting OpenGraph info
 649     @staticmethod
 650     def _og_regexes(prop):
 651         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 652         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 653                        % {'prop': re.escape(prop)})
 654         template = r'<meta[^>]+?%s[^>]+?%s'
 655         return [
 656             template % (property_re, content_re),
 657             template % (content_re, property_re),
 658         ]
 659
 660     @staticmethod
 661     def _meta_regex(prop):
 662         return r'''(?isx)<meta
 663                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 664                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 665
 666     def _og_search_property(self, prop, html, name=None, **kargs):
 667         if name is None:
 668             name = 'OpenGraph %s' % prop
 669         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 670         if escaped is None:
 671             return None
 672         return unescapeHTML(escaped)
 673
 674     def _og_search_thumbnail(self, html, **kargs):
 675         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 676
 677     def _og_search_description(self, html, **kargs):
 678         return self._og_search_property('description', html, fatal=False, **kargs)
 679
 680     def _og_search_title(self, html, **kargs):
 681         return self._og_search_property('title', html, **kargs)
 682
 683     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 684         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 685         if secure:
 686             regexes = self._og_regexes('video:secure_url') + regexes
 687         return self._html_search_regex(regexes, html, name, **kargs)
 688
 689     def _og_search_url(self, html, **kargs):
 690         return self._og_search_property('url', html, **kargs)
 691
 692     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 693         if display_name is None:
 694             display_name = name
 695         return self._html_search_regex(
 696             self._meta_regex(name),
 697             html, display_name, fatal=fatal, group='content', **kwargs)
 698
 699     def _dc_search_uploader(self, html):
 700         return self._html_search_meta('dc.creator', html, 'uploader')
 701
 702     def _rta_search(self, html):
 703         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 704         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 705                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 706                      html):
 707             return 18
 708         return 0
 709
 710     def _media_rating_search(self, html):
 711         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 712         rating = self._html_search_meta('rating', html)
 713
 714         if not rating:
 715             return None
 716
 717         RATING_TABLE = {
 718             'safe for kids': 0,
 719             'general': 8,
 720             '14 years': 14,
 721             'mature': 17,
 722             'restricted': 19,
 723         }
 724         return RATING_TABLE.get(rating.lower(), None)
 725
 726     def _family_friendly_search(self, html):
 727         # See http://schema.org/VideoObject
 728         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 729
 730         if not family_friendly:
 731             return None
 732
 733         RATING_TABLE = {
 734             '1': 0,
 735             'true': 0,
 736             '0': 18,
 737             'false': 18,
 738         }
 739         return RATING_TABLE.get(family_friendly.lower(), None)
 740
 741     def _twitter_search_player(self, html):
 742         return self._html_search_meta('twitter:player', html,
 743                                       'twitter card player')
 744
 745     @staticmethod
 746     def _hidden_inputs(html):
 747         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 748         hidden_inputs = {}
 749         for input in re.findall(r'(?i)<input([^>]+)>', html):
 750             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 751                 continue
 752             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 753             if not name:
 754                 continue
 755             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 756             if not value:
 757                 continue
 758             hidden_inputs[name.group('value')] = value.group('value')
 759         return hidden_inputs
 760
 761     def _form_hidden_inputs(self, form_id, html):
 762         form = self._search_regex(
 763             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 764             html, '%s form' % form_id, group='form')
 765         return self._hidden_inputs(form)
 766
 767     def _sort_formats(self, formats, field_preference=None):
 768         if not formats:
 769             raise ExtractorError('No video formats found')
 770
 771         def _formats_key(f):
 772             # TODO remove the following workaround
 773             from ..utils import determine_ext
 774             if not f.get('ext') and 'url' in f:
 775                 f['ext'] = determine_ext(f['url'])
 776
 777             if isinstance(field_preference, (list, tuple)):
 778                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 779
 780             preference = f.get('preference')
 781             if preference is None:
 782                 preference = 0
 783                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 784                     preference -= 0.5
 785
 786             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 787
 788             if f.get('vcodec') == 'none':  # audio only
 789                 if self._downloader.params.get('prefer_free_formats'):
 790                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 791                 else:
 792                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 793                 ext_preference = 0
 794                 try:
 795                     audio_ext_preference = ORDER.index(f['ext'])
 796                 except ValueError:
 797                     audio_ext_preference = -1
 798             else:
 799                 if self._downloader.params.get('prefer_free_formats'):
 800                     ORDER = ['flv', 'mp4', 'webm']
 801                 else:
 802                     ORDER = ['webm', 'flv', 'mp4']
 803                 try:
 804                     ext_preference = ORDER.index(f['ext'])
 805                 except ValueError:
 806                     ext_preference = -1
 807                 audio_ext_preference = 0
 808
 809             return (
 810                 preference,
 811                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 812                 f.get('quality') if f.get('quality') is not None else -1,
 813                 f.get('tbr') if f.get('tbr') is not None else -1,
 814                 f.get('filesize') if f.get('filesize') is not None else -1,
 815                 f.get('vbr') if f.get('vbr') is not None else -1,
 816                 f.get('height') if f.get('height') is not None else -1,
 817                 f.get('width') if f.get('width') is not None else -1,
 818                 proto_preference,
 819                 ext_preference,
 820                 f.get('abr') if f.get('abr') is not None else -1,
 821                 audio_ext_preference,
 822                 f.get('fps') if f.get('fps') is not None else -1,
 823                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 824                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 825                 f.get('format_id') if f.get('format_id') is not None else '',
 826             )
 827         formats.sort(key=_formats_key)
 828
 829     def _check_formats(self, formats, video_id):
 830         if formats:
 831             formats[:] = filter(
 832                 lambda f: self._is_valid_url(
 833                     f['url'], video_id,
 834                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 835                 formats)
 836
 837     def _is_valid_url(self, url, video_id, item='video'):
 838         url = self._proto_relative_url(url, scheme='http:')
 839         # For now assume non HTTP(S) URLs always valid
 840         if not (url.startswith('http://') or url.startswith('https://')):
 841             return True
 842         try:
 843             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 844             return True
 845         except ExtractorError as e:
 846             if isinstance(e.cause, compat_urllib_error.URLError):
 847                 self.to_screen(
 848                     '%s: %s URL is invalid, skipping' % (video_id, item))
 849                 return False
 850             raise
 851
 852     def http_scheme(self):
 853         """ Either "http:" or "https:", depending on the user's preferences """
 854         return (
 855             'http:'
 856             if self._downloader.params.get('prefer_insecure', False)
 857             else 'https:')
 858
 859     def _proto_relative_url(self, url, scheme=None):
 860         if url is None:
 861             return url
 862         if url.startswith('//'):
 863             if scheme is None:
 864                 scheme = self.http_scheme()
 865             return scheme + url
 866         else:
 867             return url
 868
 869     def _sleep(self, timeout, video_id, msg_template=None):
 870         if msg_template is None:
 871             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 872         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 873         self.to_screen(msg)
 874         time.sleep(timeout)
 875
 876     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 877                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 878                              fatal=True):
 879         manifest = self._download_xml(
 880             manifest_url, video_id, 'Downloading f4m manifest',
 881             'Unable to download f4m manifest',
 882             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 883             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 884             transform_source=transform_source,
 885             fatal=fatal)
 886
 887         if manifest is False:
 888             return []
 889
 890         formats = []
 891         manifest_version = '1.0'
 892         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 893         if not media_nodes:
 894             manifest_version = '2.0'
 895             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 896         base_url = xpath_text(
 897             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 898             'base URL', default=None)
 899         if base_url:
 900             base_url = base_url.strip()
 901         for i, media_el in enumerate(media_nodes):
 902             if manifest_version == '2.0':
 903                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 904                 if not media_url:
 905                     continue
 906                 manifest_url = (
 907                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 908                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 909                 # If media_url is itself a f4m manifest do the recursive extraction
 910                 # since bitrates in parent manifest (this one) and media_url manifest
 911                 # may differ leading to inability to resolve the format by requested
 912                 # bitrate in f4m downloader
 913                 if determine_ext(manifest_url) == 'f4m':
 914                     formats.extend(self._extract_f4m_formats(
 915                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
 916                     continue
 917             tbr = int_or_none(media_el.attrib.get('bitrate'))
 918             formats.append({
 919                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 920                 'url': manifest_url,
 921                 'ext': 'flv',
 922                 'tbr': tbr,
 923                 'width': int_or_none(media_el.attrib.get('width')),
 924                 'height': int_or_none(media_el.attrib.get('height')),
 925                 'preference': preference,
 926             })
 927         self._sort_formats(formats)
 928
 929         return formats
 930
 931     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 932                               entry_protocol='m3u8', preference=None,
 933                               m3u8_id=None, note=None, errnote=None,
 934                               fatal=True):
 935
 936         formats = [{
 937             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
 938             'url': m3u8_url,
 939             'ext': ext,
 940             'protocol': 'm3u8',
 941             'preference': preference - 1 if preference else -1,
 942             'resolution': 'multiple',
 943             'format_note': 'Quality selection URL',
 944         }]
 945
 946         format_url = lambda u: (
 947             u
 948             if re.match(r'^https?://', u)
 949             else compat_urlparse.urljoin(m3u8_url, u))
 950
 951         res = self._download_webpage_handle(
 952             m3u8_url, video_id,
 953             note=note or 'Downloading m3u8 information',
 954             errnote=errnote or 'Failed to download m3u8 information',
 955             fatal=fatal)
 956         if res is False:
 957             return []
 958         m3u8_doc, urlh = res
 959         m3u8_url = urlh.geturl()
 960         last_info = None
 961         last_media = None
 962         kv_rex = re.compile(
 963             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 964         for line in m3u8_doc.splitlines():
 965             if line.startswith('#EXT-X-STREAM-INF:'):
 966                 last_info = {}
 967                 for m in kv_rex.finditer(line):
 968                     v = m.group('val')
 969                     if v.startswith('"'):
 970                         v = v[1:-1]
 971                     last_info[m.group('key')] = v
 972             elif line.startswith('#EXT-X-MEDIA:'):
 973                 last_media = {}
 974                 for m in kv_rex.finditer(line):
 975                     v = m.group('val')
 976                     if v.startswith('"'):
 977                         v = v[1:-1]
 978                     last_media[m.group('key')] = v
 979             elif line.startswith('#') or not line.strip():
 980                 continue
 981             else:
 982                 if last_info is None:
 983                     formats.append({'url': format_url(line)})
 984                     continue
 985                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 986                 format_id = []
 987                 if m3u8_id:
 988                     format_id.append(m3u8_id)
 989                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
 990                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
 991                 f = {
 992                     'format_id': '-'.join(format_id),
 993                     'url': format_url(line.strip()),
 994                     'tbr': tbr,
 995                     'ext': ext,
 996                     'protocol': entry_protocol,
 997                     'preference': preference,
 998                 }
 999                 codecs = last_info.get('CODECS')
1000                 if codecs:
1001                     # TODO: looks like video codec is not always necessarily goes first
1002                     va_codecs = codecs.split(',')
1003                     if va_codecs[0]:
1004                         f['vcodec'] = va_codecs[0].partition('.')[0]
1005                     if len(va_codecs) > 1 and va_codecs[1]:
1006                         f['acodec'] = va_codecs[1].partition('.')[0]
1007                 resolution = last_info.get('RESOLUTION')
1008                 if resolution:
1009                     width_str, height_str = resolution.split('x')
1010                     f['width'] = int(width_str)
1011                     f['height'] = int(height_str)
1012                 if last_media is not None:
1013                     f['m3u8_media'] = last_media
1014                     last_media = None
1015                 formats.append(f)
1016                 last_info = {}
1017         self._sort_formats(formats)
1018         return formats
1019
1020     @staticmethod
1021     def _xpath_ns(path, namespace=None):
1022         if not namespace:
1023             return path
1024         out = []
1025         for c in path.split('/'):
1026             if not c or c == '.':
1027                 out.append(c)
1028             else:
1029                 out.append('{%s}%s' % (namespace, c))
1030         return '/'.join(out)
1031
1032     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1033         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1034
1035         if smil is False:
1036             assert not fatal
1037             return []
1038
1039         namespace = self._parse_smil_namespace(smil)
1040
1041         return self._parse_smil_formats(
1042             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1043
1044     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1045         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1046         if smil is False:
1047             return {}
1048         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1049
1050     def _download_smil(self, smil_url, video_id, fatal=True):
1051         return self._download_xml(
1052             smil_url, video_id, 'Downloading SMIL file',
1053             'Unable to download SMIL file', fatal=fatal)
1054
1055     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1056         namespace = self._parse_smil_namespace(smil)
1057
1058         formats = self._parse_smil_formats(
1059             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1060         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1061
1062         video_id = os.path.splitext(url_basename(smil_url))[0]
1063         title = None
1064         description = None
1065         upload_date = None
1066         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1067             name = meta.attrib.get('name')
1068             content = meta.attrib.get('content')
1069             if not name or not content:
1070                 continue
1071             if not title and name == 'title':
1072                 title = content
1073             elif not description and name in ('description', 'abstract'):
1074                 description = content
1075             elif not upload_date and name == 'date':
1076                 upload_date = unified_strdate(content)
1077
1078         thumbnails = [{
1079             'id': image.get('type'),
1080             'url': image.get('src'),
1081             'width': int_or_none(image.get('width')),
1082             'height': int_or_none(image.get('height')),
1083         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1084
1085         return {
1086             'id': video_id,
1087             'title': title or video_id,
1088             'description': description,
1089             'upload_date': upload_date,
1090             'thumbnails': thumbnails,
1091             'formats': formats,
1092             'subtitles': subtitles,
1093         }
1094
1095     def _parse_smil_namespace(self, smil):
1096         return self._search_regex(
1097             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1098
1099     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1100         base = smil_url
1101         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1102             b = meta.get('base') or meta.get('httpBase')
1103             if b:
1104                 base = b
1105                 break
1106
1107         formats = []
1108         rtmp_count = 0
1109         http_count = 0
1110
1111         videos = smil.findall(self._xpath_ns('.//video', namespace))
1112         for video in videos:
1113             src = video.get('src')
1114             if not src:
1115                 continue
1116
1117             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1118             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1119             width = int_or_none(video.get('width'))
1120             height = int_or_none(video.get('height'))
1121             proto = video.get('proto')
1122             ext = video.get('ext')
1123             src_ext = determine_ext(src)
1124             streamer = video.get('streamer') or base
1125
1126             if proto == 'rtmp' or streamer.startswith('rtmp'):
1127                 rtmp_count += 1
1128                 formats.append({
1129                     'url': streamer,
1130                     'play_path': src,
1131                     'ext': 'flv',
1132                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1133                     'tbr': bitrate,
1134                     'filesize': filesize,
1135                     'width': width,
1136                     'height': height,
1137                 })
1138                 if transform_rtmp_url:
1139                     streamer, src = transform_rtmp_url(streamer, src)
1140                     formats[-1].update({
1141                         'url': streamer,
1142                         'play_path': src,
1143                     })
1144                 continue
1145
1146             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1147
1148             if proto == 'm3u8' or src_ext == 'm3u8':
1149                 formats.extend(self._extract_m3u8_formats(
1150                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False))
1151                 continue
1152
1153             if src_ext == 'f4m':
1154                 f4m_url = src_url
1155                 if not f4m_params:
1156                     f4m_params = {
1157                         'hdcore': '3.2.0',
1158                         'plugin': 'flowplayer-3.2.0.1',
1159                     }
1160                 f4m_url += '&' if '?' in f4m_url else '?'
1161                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1162                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1163                 continue
1164
1165             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1166                 http_count += 1
1167                 formats.append({
1168                     'url': src_url,
1169                     'ext': ext or src_ext or 'flv',
1170                     'format_id': 'http-%d' % (bitrate or http_count),
1171                     'tbr': bitrate,
1172                     'filesize': filesize,
1173                     'width': width,
1174                     'height': height,
1175                 })
1176                 continue
1177
1178         self._sort_formats(formats)
1179
1180         return formats
1181
1182     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1183         subtitles = {}
1184         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1185             src = textstream.get('src')
1186             if not src:
1187                 continue
1188             ext = textstream.get('ext') or determine_ext(src)
1189             if not ext:
1190                 type_ = textstream.get('type')
1191                 SUBTITLES_TYPES = {
1192                     'text/vtt': 'vtt',
1193                     'text/srt': 'srt',
1194                     'application/smptett+xml': 'tt',
1195                 }
1196                 if type_ in SUBTITLES_TYPES:
1197                     ext = SUBTITLES_TYPES[type_]
1198             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1199             subtitles.setdefault(lang, []).append({
1200                 'url': src,
1201                 'ext': ext,
1202             })
1203         return subtitles
1204
1205     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1206         xspf = self._download_xml(
1207             playlist_url, playlist_id, 'Downloading xpsf playlist',
1208             'Unable to download xspf manifest', fatal=fatal)
1209         if xspf is False:
1210             return []
1211         return self._parse_xspf(xspf, playlist_id)
1212
1213     def _parse_xspf(self, playlist, playlist_id):
1214         NS_MAP = {
1215             'xspf': 'http://xspf.org/ns/0/',
1216             's1': 'http://static.streamone.nl/player/ns/0',
1217         }
1218
1219         entries = []
1220         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1221             title = xpath_text(
1222                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1223             description = xpath_text(
1224                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1225             thumbnail = xpath_text(
1226                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1227             duration = float_or_none(
1228                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1229
1230             formats = [{
1231                 'url': location.text,
1232                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1233                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1234                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1235             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1236             self._sort_formats(formats)
1237
1238             entries.append({
1239                 'id': playlist_id,
1240                 'title': title,
1241                 'description': description,
1242                 'thumbnail': thumbnail,
1243                 'duration': duration,
1244                 'formats': formats,
1245             })
1246         return entries
1247
1248     def _live_title(self, name):
1249         """ Generate the title for a live video """
1250         now = datetime.datetime.now()
1251         now_str = now.strftime("%Y-%m-%d %H:%M")
1252         return name + ' ' + now_str
1253
1254     def _int(self, v, name, fatal=False, **kwargs):
1255         res = int_or_none(v, **kwargs)
1256         if 'get_attr' in kwargs:
1257             print(getattr(v, kwargs['get_attr']))
1258         if res is None:
1259             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1260             if fatal:
1261                 raise ExtractorError(msg)
1262             else:
1263                 self._downloader.report_warning(msg)
1264         return res
1265
1266     def _float(self, v, name, fatal=False, **kwargs):
1267         res = float_or_none(v, **kwargs)
1268         if res is None:
1269             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1270             if fatal:
1271                 raise ExtractorError(msg)
1272             else:
1273                 self._downloader.report_warning(msg)
1274         return res
1275
1276     def _set_cookie(self, domain, name, value, expire_time=None):
1277         cookie = compat_cookiejar.Cookie(
1278             0, name, value, None, None, domain, None,
1279             None, '/', True, False, expire_time, '', None, None, None)
1280         self._downloader.cookiejar.set_cookie(cookie)
1281
1282     def _get_cookies(self, url):
1283         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1284         req = sanitized_Request(url)
1285         self._downloader.cookiejar.add_cookie_header(req)
1286         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1287
1288     def get_testcases(self, include_onlymatching=False):
1289         t = getattr(self, '_TEST', None)
1290         if t:
1291             assert not hasattr(self, '_TESTS'), \
1292                 '%s has _TEST and _TESTS' % type(self).__name__
1293             tests = [t]
1294         else:
1295             tests = getattr(self, '_TESTS', [])
1296         for t in tests:
1297             if not include_onlymatching and t.get('only_matching', False):
1298                 continue
1299             t['name'] = type(self).__name__[:-len('IE')]
1300             yield t
1301
1302     def is_suitable(self, age_limit):
1303         """ Test whether the extractor is generally suitable for the given
1304         age limit (i.e. pornographic sites are not, all others usually are) """
1305
1306         any_restricted = False
1307         for tc in self.get_testcases(include_onlymatching=False):
1308             if 'playlist' in tc:
1309                 tc = tc['playlist'][0]
1310             is_restricted = age_restricted(
1311                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1312             if not is_restricted:
1313                 return True
1314             any_restricted = any_restricted or is_restricted
1315         return not any_restricted
1316
1317     def extract_subtitles(self, *args, **kwargs):
1318         if (self._downloader.params.get('writesubtitles', False) or
1319                 self._downloader.params.get('listsubtitles')):
1320             return self._get_subtitles(*args, **kwargs)
1321         return {}
1322
1323     def _get_subtitles(self, *args, **kwargs):
1324         raise NotImplementedError("This method must be implemented by subclasses")
1325
1326     @staticmethod
1327     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1328         """ Merge subtitle items for one language. Items with duplicated URLs
1329         will be dropped. """
1330         list1_urls = set([item['url'] for item in subtitle_list1])
1331         ret = list(subtitle_list1)
1332         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1333         return ret
1334
1335     @classmethod
1336     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1337         """ Merge two subtitle dictionaries, language by language. """
1338         ret = dict(subtitle_dict1)
1339         for lang in subtitle_dict2:
1340             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1341         return ret
1342
1343     def extract_automatic_captions(self, *args, **kwargs):
1344         if (self._downloader.params.get('writeautomaticsub', False) or
1345                 self._downloader.params.get('listsubtitles')):
1346             return self._get_automatic_captions(*args, **kwargs)
1347         return {}
1348
1349     def _get_automatic_captions(self, *args, **kwargs):
1350         raise NotImplementedError("This method must be implemented by subclasses")
1351
1352
1353 class SearchInfoExtractor(InfoExtractor):
1354     """
1355     Base class for paged search queries extractors.
1356     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1357     Instances should define _SEARCH_KEY and _MAX_RESULTS.
1358     """
1359
1360     @classmethod
1361     def _make_valid_url(cls):
1362         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1363
1364     @classmethod
1365     def suitable(cls, url):
1366         return re.match(cls._make_valid_url(), url) is not None
1367
1368     def _real_extract(self, query):
1369         mobj = re.match(self._make_valid_url(), query)
1370         if mobj is None:
1371             raise ExtractorError('Invalid search query "%s"' % query)
1372
1373         prefix = mobj.group('prefix')
1374         query = mobj.group('query')
1375         if prefix == '':
1376             return self._get_n_results(query, 1)
1377         elif prefix == 'all':
1378             return self._get_n_results(query, self._MAX_RESULTS)
1379         else:
1380             n = int(prefix)
1381             if n <= 0:
1382                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1383             elif n > self._MAX_RESULTS:
1384                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1385                 n = self._MAX_RESULTS
1386             return self._get_n_results(query, n)
1387
1388     def _get_n_results(self, query, n):
1389         """Get a specified number of results for a query"""
1390         raise NotImplementedError("This method must be implemented by subclasses")
1391
1392     @property
1393     def SEARCH_KEY(self):
1394         return self._SEARCH_KEY