_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13
  14 from ..compat import (
  15     compat_cookiejar,
  16     compat_cookies,
  17     compat_getpass,
  18     compat_http_client,
  19     compat_urllib_error,
  20     compat_urllib_parse,
  21     compat_urlparse,
  22     compat_str,
  23     compat_etree_fromstring,
  24 )
  25 from ..utils import (
  26     NO_DEFAULT,
  27     age_restricted,
  28     bug_reports_message,
  29     clean_html,
  30     compiled_regex_type,
  31     determine_ext,
  32     error_to_compat_str,
  33     ExtractorError,
  34     fix_xml_ampersands,
  35     float_or_none,
  36     int_or_none,
  37     parse_iso8601,
  38     RegexNotFoundError,
  39     sanitize_filename,
  40     sanitized_Request,
  41     unescapeHTML,
  42     unified_strdate,
  43     url_basename,
  44     xpath_text,
  45     xpath_with_ns,
  46     determine_protocol,
  47 )
  48
  49
  50 class InfoExtractor(object):
  51     """Information Extractor class.
  52
  53     Information extractors are the classes that, given a URL, extract
  54     information about the video (or videos) the URL refers to. This
  55     information includes the real video URL, the video title, author and
  56     others. The information is stored in a dictionary which is then
  57     passed to the YoutubeDL. The YoutubeDL processes this
  58     information possibly downloading the video to the file system, among
  59     other possible outcomes.
  60
  61     The type field determines the type of the result.
  62     By far the most common value (and the default if _type is missing) is
  63     "video", which indicates a single video.
  64
  65     For a video, the dictionaries must include the following fields:
  66
  67     id:             Video identifier.
  68     title:          Video title, unescaped.
  69
  70     Additionally, it must contain either a formats entry or a url one:
  71
  72     formats:        A list of dictionaries for each format available, ordered
  73                     from worst to best quality.
  74
  75                     Potential fields:
  76                     * url        Mandatory. The URL of the video file
  77                     * ext        Will be calculated from URL if missing
  78                     * format     A human-readable description of the format
  79                                  ("mp4 container with h264/opus").
  80                                  Calculated from the format_id, width, height,
  81                                  and format_note fields if missing.
  82                     * format_id  A short description of the format
  83                                  ("mp4_h264_opus" or "19").
  84                                 Technically optional, but strongly recommended.
  85                     * format_note Additional info about the format
  86                                  ("3D" or "DASH video")
  87                     * width      Width of the video, if known
  88                     * height     Height of the video, if known
  89                     * resolution Textual description of width and height
  90                     * tbr        Average bitrate of audio and video in KBit/s
  91                     * abr        Average audio bitrate in KBit/s
  92                     * acodec     Name of the audio codec in use
  93                     * asr        Audio sampling rate in Hertz
  94                     * vbr        Average video bitrate in KBit/s
  95                     * fps        Frame rate
  96                     * vcodec     Name of the video codec in use
  97                     * container  Name of the container format
  98                     * filesize   The number of bytes, if known in advance
  99                     * filesize_approx  An estimate for the number of bytes
 100                     * player_url SWF Player URL (used for rtmpdump).
 101                     * protocol   The protocol that will be used for the actual
 102                                  download, lower-case.
 103                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 104                                  "m3u8", or "m3u8_native".
 105                     * preference Order number of this format. If this field is
 106                                  present and not None, the formats get sorted
 107                                  by this field, regardless of all other values.
 108                                  -1 for default (order by other properties),
 109                                  -2 or smaller for less than default.
 110                                  < -1000 to hide the format (if there is
 111                                     another one which is strictly better)
 112                     * language   Language code, e.g. "de" or "en-US".
 113                     * language_preference  Is this in the language mentioned in
 114                                  the URL?
 115                                  10 if it's what the URL is about,
 116                                  -1 for default (don't know),
 117                                  -10 otherwise, other values reserved for now.
 118                     * quality    Order number of the video quality of this
 119                                  format, irrespective of the file format.
 120                                  -1 for default (order by other properties),
 121                                  -2 or smaller for less than default.
 122                     * source_preference  Order number for this video source
 123                                   (quality takes higher priority)
 124                                  -1 for default (order by other properties),
 125                                  -2 or smaller for less than default.
 126                     * http_headers  A dictionary of additional HTTP headers
 127                                  to add to the request.
 128                     * stretched_ratio  If given and not 1, indicates that the
 129                                  video's pixels are not square.
 130                                  width : height ratio as float.
 131                     * no_resume  The server does not support resuming the
 132                                  (HTTP or RTMP) download. Boolean.
 133
 134     url:            Final video URL.
 135     ext:            Video filename extension.
 136     format:         The video format, defaults to ext (used for --get-format)
 137     player_url:     SWF Player URL (used for rtmpdump).
 138
 139     The following fields are optional:
 140
 141     alt_title:      A secondary title of the video.
 142     display_id      An alternative identifier for the video, not necessarily
 143                     unique, but available before title. Typically, id is
 144                     something like "4234987", title "Dancing naked mole rats",
 145                     and display_id "dancing-naked-mole-rats"
 146     thumbnails:     A list of dictionaries, with the following entries:
 147                         * "id" (optional, string) - Thumbnail format ID
 148                         * "url"
 149                         * "preference" (optional, int) - quality of the image
 150                         * "width" (optional, int)
 151                         * "height" (optional, int)
 152                         * "resolution" (optional, string "{width}x{height}",
 153                                         deprecated)
 154     thumbnail:      Full URL to a video thumbnail image.
 155     description:    Full video description.
 156     uploader:       Full name of the video uploader.
 157     creator:        The main artist who created the video.
 158     release_date:   The date (YYYYMMDD) when the video was released.
 159     timestamp:      UNIX timestamp of the moment the video became available.
 160     upload_date:    Video upload date (YYYYMMDD).
 161                     If not explicitly set, calculated from timestamp.
 162     uploader_id:    Nickname or id of the video uploader.
 163     location:       Physical location where the video was filmed.
 164     subtitles:      The available subtitles as a dictionary in the format
 165                     {language: subformats}. "subformats" is a list sorted from
 166                     lower to higher preference, each element is a dictionary
 167                     with the "ext" entry and one of:
 168                         * "data": The subtitles file contents
 169                         * "url": A URL pointing to the subtitles file
 170                     "ext" will be calculated from URL if missing
 171     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 172                     automatically generated captions
 173     duration:       Length of the video in seconds, as an integer or float.
 174     view_count:     How many users have watched the video on the platform.
 175     like_count:     Number of positive ratings of the video
 176     dislike_count:  Number of negative ratings of the video
 177     repost_count:   Number of reposts of the video
 178     average_rating: Average rating given by users, the scale used depends on the webpage
 179     comment_count:  Number of comments on the video
 180     comments:       A list of comments, each with one or more of the following
 181                     properties (all but one of text or html optional):
 182                         * "author" - human-readable name of the comment author
 183                         * "author_id" - user ID of the comment author
 184                         * "id" - Comment ID
 185                         * "html" - Comment as HTML
 186                         * "text" - Plain text of the comment
 187                         * "timestamp" - UNIX timestamp of comment
 188                         * "parent" - ID of the comment this one is replying to.
 189                                      Set to "root" to indicate that this is a
 190                                      comment to the original video.
 191     age_limit:      Age restriction for the video, as an integer (years)
 192     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 193                     should allow to get the same result again. (It will be set
 194                     by YoutubeDL if it's missing)
 195     categories:     A list of categories that the video falls in, for example
 196                     ["Sports", "Berlin"]
 197     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 198     is_live:        True, False, or None (=unknown). Whether this video is a
 199                     live stream that goes on instead of a fixed-length video.
 200     start_time:     Time in seconds where the reproduction should start, as
 201                     specified in the URL.
 202     end_time:       Time in seconds where the reproduction should end, as
 203                     specified in the URL.
 204
 205     The following fields should only be used when the video belongs to some logical
 206     chapter or section:
 207
 208     chapter:        Name or title of the chapter the video belongs to.
 209     chapter_number: Number of the chapter the video belongs to, as an integer.
 210     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 211
 212     The following fields should only be used when the video is an episode of some
 213     series or programme:
 214
 215     series:         Title of the series or programme the video episode belongs to.
 216     season:         Title of the season the video episode belongs to.
 217     season_number:  Number of the season the video episode belongs to, as an integer.
 218     season_id:      Id of the season the video episode belongs to, as a unicode string.
 219     episode:        Title of the video episode. Unlike mandatory video title field,
 220                     this field should denote the exact title of the video episode
 221                     without any kind of decoration.
 222     episode_number: Number of the video episode within a season, as an integer.
 223     episode_id:     Id of the video episode, as a unicode string.
 224
 225     Unless mentioned otherwise, the fields should be Unicode strings.
 226
 227     Unless mentioned otherwise, None is equivalent to absence of information.
 228
 229
 230     _type "playlist" indicates multiple videos.
 231     There must be a key "entries", which is a list, an iterable, or a PagedList
 232     object, each element of which is a valid dictionary by this specification.
 233
 234     Additionally, playlists can have "title", "description" and "id" attributes
 235     with the same semantics as videos (see above).
 236
 237
 238     _type "multi_video" indicates that there are multiple videos that
 239     form a single show, for example multiple acts of an opera or TV episode.
 240     It must have an entries key like a playlist and contain all the keys
 241     required for a video at the same time.
 242
 243
 244     _type "url" indicates that the video must be extracted from another
 245     location, possibly by a different extractor. Its only required key is:
 246     "url" - the next URL to extract.
 247     The key "ie_key" can be set to the class name (minus the trailing "IE",
 248     e.g. "Youtube") if the extractor class is known in advance.
 249     Additionally, the dictionary may have any properties of the resolved entity
 250     known in advance, for example "title" if the title of the referred video is
 251     known ahead of time.
 252
 253
 254     _type "url_transparent" entities have the same specification as "url", but
 255     indicate that the given additional information is more precise than the one
 256     associated with the resolved URL.
 257     This is useful when a site employs a video service that hosts the video and
 258     its technical metadata, but that video service does not embed a useful
 259     title, description etc.
 260
 261
 262     Subclasses of this one should re-define the _real_initialize() and
 263     _real_extract() methods and define a _VALID_URL regexp.
 264     Probably, they should also be added to the list of extractors.
 265
 266     Finally, the _WORKING attribute should be set to False for broken IEs
 267     in order to warn the users and skip the tests.
 268     """
 269
 270     _ready = False
 271     _downloader = None
 272     _WORKING = True
 273
 274     def __init__(self, downloader=None):
 275         """Constructor. Receives an optional downloader."""
 276         self._ready = False
 277         self.set_downloader(downloader)
 278
 279     @classmethod
 280     def suitable(cls, url):
 281         """Receives a URL and returns True if suitable for this IE."""
 282
 283         # This does not use has/getattr intentionally - we want to know whether
 284         # we have cached the regexp for *this* class, whereas getattr would also
 285         # match the superclass
 286         if '_VALID_URL_RE' not in cls.__dict__:
 287             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 288         return cls._VALID_URL_RE.match(url) is not None
 289
 290     @classmethod
 291     def _match_id(cls, url):
 292         if '_VALID_URL_RE' not in cls.__dict__:
 293             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 294         m = cls._VALID_URL_RE.match(url)
 295         assert m
 296         return m.group('id')
 297
 298     @classmethod
 299     def working(cls):
 300         """Getter method for _WORKING."""
 301         return cls._WORKING
 302
 303     def initialize(self):
 304         """Initializes an instance (authentication, etc)."""
 305         if not self._ready:
 306             self._real_initialize()
 307             self._ready = True
 308
 309     def extract(self, url):
 310         """Extracts URL information and returns it in list of dicts."""
 311         try:
 312             self.initialize()
 313             return self._real_extract(url)
 314         except ExtractorError:
 315             raise
 316         except compat_http_client.IncompleteRead as e:
 317             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 318         except (KeyError, StopIteration) as e:
 319             raise ExtractorError('An extractor error has occurred.', cause=e)
 320
 321     def set_downloader(self, downloader):
 322         """Sets the downloader for this IE."""
 323         self._downloader = downloader
 324
 325     def _real_initialize(self):
 326         """Real initialization process. Redefine in subclasses."""
 327         pass
 328
 329     def _real_extract(self, url):
 330         """Real extraction process. Redefine in subclasses."""
 331         pass
 332
 333     @classmethod
 334     def ie_key(cls):
 335         """A string for getting the InfoExtractor with get_info_extractor"""
 336         return compat_str(cls.__name__[:-2])
 337
 338     @property
 339     def IE_NAME(self):
 340         return compat_str(type(self).__name__[:-2])
 341
 342     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 343         """ Returns the response handle """
 344         if note is None:
 345             self.report_download_webpage(video_id)
 346         elif note is not False:
 347             if video_id is None:
 348                 self.to_screen('%s' % (note,))
 349             else:
 350                 self.to_screen('%s: %s' % (video_id, note))
 351         try:
 352             return self._downloader.urlopen(url_or_request)
 353         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 354             if errnote is False:
 355                 return False
 356             if errnote is None:
 357                 errnote = 'Unable to download webpage'
 358
 359             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 360             if fatal:
 361                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 362             else:
 363                 self._downloader.report_warning(errmsg)
 364                 return False
 365
 366     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 367         """ Returns a tuple (page content as string, URL handle) """
 368         # Strip hashes from the URL (#1038)
 369         if isinstance(url_or_request, (compat_str, str)):
 370             url_or_request = url_or_request.partition('#')[0]
 371
 372         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 373         if urlh is False:
 374             assert not fatal
 375             return False
 376         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 377         return (content, urlh)
 378
 379     @staticmethod
 380     def _guess_encoding_from_content(content_type, webpage_bytes):
 381         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 382         if m:
 383             encoding = m.group(1)
 384         else:
 385             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 386                           webpage_bytes[:1024])
 387             if m:
 388                 encoding = m.group(1).decode('ascii')
 389             elif webpage_bytes.startswith(b'\xff\xfe'):
 390                 encoding = 'utf-16'
 391             else:
 392                 encoding = 'utf-8'
 393
 394         return encoding
 395
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of urlh and return it decoded to text.

        prefix, when not None, is prepended to the bytes read from urlh.
        encoding, when falsy, is guessed from the Content-Type header and the
        page bytes. Depending on the downloader parameters the raw bytes are
        also dumped to the screen (base64) and/or saved to a .dump file.

        Raises an expected ExtractorError when the decoded page looks like a
        Websense block page or an "URL you requested has been blocked" page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a request object or a plain URL string
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            # Cap the base name at 240 characters; the cut-off tail is
            # replaced by an MD5 digest of the full name.
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An unknown encoding name raises LookupError; fall back to UTF-8.
        # Undecodable bytes are replaced rather than raising.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
 458
 459     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 460         """ Returns the data of the page as a string """
 461         success = False
 462         try_count = 0
 463         while success is False:
 464             try:
 465                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 466                 success = True
 467             except compat_http_client.IncompleteRead as e:
 468                 try_count += 1
 469                 if try_count >= tries:
 470                     raise e
 471                 self._sleep(timeout, video_id)
 472         if res is False:
 473             return res
 474         else:
 475             content, _ = res
 476             return content
 477
 478     def _download_xml(self, url_or_request, video_id,
 479                       note='Downloading XML', errnote='Unable to download XML',
 480                       transform_source=None, fatal=True, encoding=None):
 481         """Return the xml as an xml.etree.ElementTree.Element"""
 482         xml_string = self._download_webpage(
 483             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 484         if xml_string is False:
 485             return xml_string
 486         if transform_source:
 487             xml_string = transform_source(xml_string)
 488         return compat_etree_fromstring(xml_string.encode('utf-8'))
 489
 490     def _download_json(self, url_or_request, video_id,
 491                        note='Downloading JSON metadata',
 492                        errnote='Unable to download JSON metadata',
 493                        transform_source=None,
 494                        fatal=True, encoding=None):
 495         json_string = self._download_webpage(
 496             url_or_request, video_id, note, errnote, fatal=fatal,
 497             encoding=encoding)
 498         if (not fatal) and json_string is False:
 499             return None
 500         return self._parse_json(
 501             json_string, video_id, transform_source=transform_source, fatal=fatal)
 502
 503     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 504         if transform_source:
 505             json_string = transform_source(json_string)
 506         try:
 507             return json.loads(json_string)
 508         except ValueError as ve:
 509             errmsg = '%s: Failed to parse JSON ' % video_id
 510             if fatal:
 511                 raise ExtractorError(errmsg, cause=ve)
 512             else:
 513                 self.report_warning(errmsg + str(ve))
 514
 515     def report_warning(self, msg, video_id=None):
 516         idstr = '' if video_id is None else '%s: ' % video_id
 517         self._downloader.report_warning(
 518             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 519
 520     def to_screen(self, msg):
 521         """Print msg to screen, prefixing it with '[ie_name]'"""
 522         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 523
 524     def report_extraction(self, id_or_name):
 525         """Report information extraction."""
 526         self.to_screen('%s: Extracting information' % id_or_name)
 527
 528     def report_download_webpage(self, video_id):
 529         """Report webpage download."""
 530         self.to_screen('%s: Downloading webpage' % video_id)
 531
 532     def report_age_confirmation(self):
 533         """Report attempt to confirm age."""
 534         self.to_screen('Confirming age')
 535
 536     def report_login(self):
 537         """Report attempt to log in."""
 538         self.to_screen('Logging in')
 539
 540     @staticmethod
 541     def raise_login_required(msg='This video is only available for registered users'):
 542         raise ExtractorError(
 543             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 544             expected=True)
 545
 546     @staticmethod
 547     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 548         raise ExtractorError(
 549             '%s. You might want to use --proxy to workaround.' % msg,
 550             expected=True)
 551
 552     # Methods for following #608
 553     @staticmethod
 554     def url_result(url, ie=None, video_id=None, video_title=None):
 555         """Returns a URL that points to a page that should be processed"""
 556         # TODO: ie should be the class used for getting the info
 557         video_info = {'_type': 'url',
 558                       'url': url,
 559                       'ie_key': ie}
 560         if video_id is not None:
 561             video_info['id'] = video_id
 562         if video_title is not None:
 563             video_info['title'] = video_title
 564         return video_info
 565
 566     @staticmethod
 567     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 568         """Returns a playlist"""
 569         video_info = {'_type': 'playlist',
 570                       'entries': entries}
 571         if playlist_id:
 572             video_info['id'] = playlist_id
 573         if playlist_title:
 574             video_info['title'] = playlist_title
 575         if playlist_description:
 576             video_info['description'] = playlist_description
 577         return video_info
 578
 579     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 580         """
 581         Perform a regex search on the given string, using a single or a list of
 582         patterns returning the first matching group.
 583         In case of failure return a default value or raise a WARNING or a
 584         RegexNotFoundError, depending on fatal, specifying the field name.
 585         """
 586         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 587             mobj = re.search(pattern, string, flags)
 588         else:
 589             for p in pattern:
 590                 mobj = re.search(p, string, flags)
 591                 if mobj:
 592                     break
 593
 594         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 595             _name = '\033[0;34m%s\033[0m' % name
 596         else:
 597             _name = name
 598
 599         if mobj:
 600             if group is None:
 601                 # return the first matching group
 602                 return next(g for g in mobj.groups() if g is not None)
 603             else:
 604                 return mobj.group(group)
 605         elif default is not NO_DEFAULT:
 606             return default
 607         elif fatal:
 608             raise RegexNotFoundError('Unable to extract %s' % _name)
 609         else:
 610             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 611             return None
 612
 613     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 614         """
 615         Like _search_regex, but strips HTML tags and unescapes entities.
 616         """
 617         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 618         if res:
 619             return clean_html(res).strip()
 620         else:
 621             return res
 622
 623     def _get_login_info(self):
 624         """
 625         Get the login info as (username, password)
 626         It will look in the netrc file using the _NETRC_MACHINE value
 627         If there's no info available, return (None, None)
 628         """
 629         if self._downloader is None:
 630             return (None, None)
 631
 632         username = None
 633         password = None
 634         downloader_params = self._downloader.params
 635
 636         # Attempt to use provided username and password or .netrc data
 637         if downloader_params.get('username', None) is not None:
 638             username = downloader_params['username']
 639             password = downloader_params['password']
 640         elif downloader_params.get('usenetrc', False):
 641             try:
 642                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 643                 if info is not None:
 644                     username = info[0]
 645                     password = info[2]
 646                 else:
 647                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 648             except (IOError, netrc.NetrcParseError) as err:
 649                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 650
 651         return (username, password)
 652
 653     def _get_tfa_info(self, note='two-factor verification code'):
 654         """
 655         Get the two-factor authentication info
 656         TODO - asking the user will be required for sms/phone verify
 657         currently just uses the command line option
 658         If there's no info available, return None
 659         """
 660         if self._downloader is None:
 661             return None
 662         downloader_params = self._downloader.params
 663
 664         if downloader_params.get('twofactor', None) is not None:
 665             return downloader_params['twofactor']
 666
 667         return compat_getpass('Type %s and press [Return]: ' % note)
 668
 669     # Helper functions for extracting OpenGraph info
 670     @staticmethod
 671     def _og_regexes(prop):
 672         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 673         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 674                        % {'prop': re.escape(prop)})
 675         template = r'<meta[^>]+?%s[^>]+?%s'
 676         return [
 677             template % (property_re, content_re),
 678             template % (content_re, property_re),
 679         ]
 680
 681     @staticmethod
 682     def _meta_regex(prop):
 683         return r'''(?isx)<meta
 684                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 685                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 686
 687     def _og_search_property(self, prop, html, name=None, **kargs):
 688         if name is None:
 689             name = 'OpenGraph %s' % prop
 690         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 691         if escaped is None:
 692             return None
 693         return unescapeHTML(escaped)
 694
 695     def _og_search_thumbnail(self, html, **kargs):
 696         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 697
 698     def _og_search_description(self, html, **kargs):
 699         return self._og_search_property('description', html, fatal=False, **kargs)
 700
 701     def _og_search_title(self, html, **kargs):
 702         return self._og_search_property('title', html, **kargs)
 703
 704     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 705         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 706         if secure:
 707             regexes = self._og_regexes('video:secure_url') + regexes
 708         return self._html_search_regex(regexes, html, name, **kargs)
 709
 710     def _og_search_url(self, html, **kargs):
 711         return self._og_search_property('url', html, **kargs)
 712
 713     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 714         if display_name is None:
 715             display_name = name
 716         return self._html_search_regex(
 717             self._meta_regex(name),
 718             html, display_name, fatal=fatal, group='content', **kwargs)
 719
 720     def _dc_search_uploader(self, html):
 721         return self._html_search_meta('dc.creator', html, 'uploader')
 722
 723     def _rta_search(self, html):
 724         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 725         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 726                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 727                      html):
 728             return 18
 729         return 0
 730
 731     def _media_rating_search(self, html):
 732         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 733         rating = self._html_search_meta('rating', html)
 734
 735         if not rating:
 736             return None
 737
 738         RATING_TABLE = {
 739             'safe for kids': 0,
 740             'general': 8,
 741             '14 years': 14,
 742             'mature': 17,
 743             'restricted': 19,
 744         }
 745         return RATING_TABLE.get(rating.lower(), None)
 746
 747     def _family_friendly_search(self, html):
 748         # See http://schema.org/VideoObject
 749         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 750
 751         if not family_friendly:
 752             return None
 753
 754         RATING_TABLE = {
 755             '1': 0,
 756             'true': 0,
 757             '0': 18,
 758             'false': 18,
 759         }
 760         return RATING_TABLE.get(family_friendly.lower(), None)
 761
 762     def _twitter_search_player(self, html):
 763         return self._html_search_meta('twitter:player', html,
 764                                       'twitter card player')
 765
 766     def _search_json_ld(self, html, video_id, **kwargs):
 767         json_ld = self._search_regex(
 768             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 769             html, 'JSON-LD', group='json_ld', **kwargs)
 770         if not json_ld:
 771             return {}
 772         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 773
 774     def _json_ld(self, json_ld, video_id, fatal=True):
 775         if isinstance(json_ld, compat_str):
 776             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 777         if not json_ld:
 778             return {}
 779         info = {}
 780         if json_ld.get('@context') == 'http://schema.org':
 781             item_type = json_ld.get('@type')
 782             if item_type == 'TVEpisode':
 783                 info.update({
 784                     'episode': unescapeHTML(json_ld.get('name')),
 785                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 786                     'description': unescapeHTML(json_ld.get('description')),
 787                 })
 788                 part_of_season = json_ld.get('partOfSeason')
 789                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 790                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 791                 part_of_series = json_ld.get('partOfSeries')
 792                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 793                     info['series'] = unescapeHTML(part_of_series.get('name'))
 794             elif item_type == 'Article':
 795                 info.update({
 796                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 797                     'title': unescapeHTML(json_ld.get('headline')),
 798                     'description': unescapeHTML(json_ld.get('articleBody')),
 799                 })
 800         return dict((k, v) for k, v in info.items() if v is not None)
 801
 802     @staticmethod
 803     def _hidden_inputs(html):
 804         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 805         hidden_inputs = {}
 806         for input in re.findall(r'(?i)<input([^>]+)>', html):
 807             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 808                 continue
 809             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 810             if not name:
 811                 continue
 812             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 813             if not value:
 814                 continue
 815             hidden_inputs[name.group('value')] = value.group('value')
 816         return hidden_inputs
 817
 818     def _form_hidden_inputs(self, form_id, html):
 819         form = self._search_regex(
 820             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 821             html, '%s form' % form_id, group='form')
 822         return self._hidden_inputs(form)
 823
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort the list of format dicts `formats` in place.

        The list is sorted in ascending order of the key computed by
        _formats_key, so formats with larger key values end up last.
        If `field_preference` is a list or tuple of field names, the key is
        just the tuple of those fields (missing/None counted as -1);
        otherwise a fixed tuple of preference, quality, bitrate, size,
        dimension, protocol and extension criteria is used.

        As a side effect, formats without 'ext' but with 'url' get their
        'ext' filled in from the URL.

        Raises ExtractorError if `formats` is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering overrides all the default criteria
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Non-HTTP(S) protocols are ranked slightly lower
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            # The position in ORDER is the rank of the extension; unknown
            # extensions rank below all listed ones (-1)
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Earlier tuple members take precedence; missing numeric fields
            # count as -1
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 885
 886     def _check_formats(self, formats, video_id):
 887         if formats:
 888             formats[:] = filter(
 889                 lambda f: self._is_valid_url(
 890                     f['url'], video_id,
 891                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 892                 formats)
 893
 894     def _is_valid_url(self, url, video_id, item='video'):
 895         url = self._proto_relative_url(url, scheme='http:')
 896         # For now assume non HTTP(S) URLs always valid
 897         if not (url.startswith('http://') or url.startswith('https://')):
 898             return True
 899         try:
 900             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 901             return True
 902         except ExtractorError as e:
 903             if isinstance(e.cause, compat_urllib_error.URLError):
 904                 self.to_screen(
 905                     '%s: %s URL is invalid, skipping' % (video_id, item))
 906                 return False
 907             raise
 908
 909     def http_scheme(self):
 910         """ Either "http:" or "https:", depending on the user's preferences """
 911         return (
 912             'http:'
 913             if self._downloader.params.get('prefer_insecure', False)
 914             else 'https:')
 915
 916     def _proto_relative_url(self, url, scheme=None):
 917         if url is None:
 918             return url
 919         if url.startswith('//'):
 920             if scheme is None:
 921                 scheme = self.http_scheme()
 922             return scheme + url
 923         else:
 924             return url
 925
 926     def _sleep(self, timeout, video_id, msg_template=None):
 927         if msg_template is None:
 928             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 929         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 930         self.to_screen(msg)
 931         time.sleep(timeout)
 932
 933     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 934                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 935                              fatal=True):
 936         manifest = self._download_xml(
 937             manifest_url, video_id, 'Downloading f4m manifest',
 938             'Unable to download f4m manifest',
 939             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 940             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 941             transform_source=transform_source,
 942             fatal=fatal)
 943
 944         if manifest is False:
 945             return []
 946
 947         formats = []
 948         manifest_version = '1.0'
 949         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 950         if not media_nodes:
 951             manifest_version = '2.0'
 952             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 953         base_url = xpath_text(
 954             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 955             'base URL', default=None)
 956         if base_url:
 957             base_url = base_url.strip()
 958         for i, media_el in enumerate(media_nodes):
 959             if manifest_version == '2.0':
 960                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 961                 if not media_url:
 962                     continue
 963                 manifest_url = (
 964                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 965                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 966                 # If media_url is itself a f4m manifest do the recursive extraction
 967                 # since bitrates in parent manifest (this one) and media_url manifest
 968                 # may differ leading to inability to resolve the format by requested
 969                 # bitrate in f4m downloader
 970                 if determine_ext(manifest_url) == 'f4m':
 971                     formats.extend(self._extract_f4m_formats(
 972                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
 973                     continue
 974             tbr = int_or_none(media_el.attrib.get('bitrate'))
 975             formats.append({
 976                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 977                 'url': manifest_url,
 978                 'ext': 'flv',
 979                 'tbr': tbr,
 980                 'width': int_or_none(media_el.attrib.get('width')),
 981                 'height': int_or_none(media_el.attrib.get('height')),
 982                 'preference': preference,
 983             })
 984         self._sort_formats(formats)
 985
 986         return formats
 987
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """
        Download an m3u8 playlist and build the list of formats it describes.

        m3u8_url:       URL of the playlist.
        video_id:       Passed to the download helper.
        ext:            Stored as 'ext' of every generated format.
        entry_protocol: Stored as 'protocol' of the formats built from
                        playlist entries.
        preference:     Stored as 'preference' of those formats.
        m3u8_id:        Optional prefix of the generated format ids.
        note, errnote:  Override the default download messages.
        fatal:          Passed to the download helper; when the download
                        fails non-fatally, [] is returned.

        If the document contains #EXT-X-TARGETDURATION (i.e. it is a media
        playlist) a single format pointing at the playlist is returned.
        Otherwise one format is built per URI line, using the attributes of
        the preceding #EXT-X-STREAM-INF / #EXT-X-MEDIA tags, in addition to
        a 'meta' format pointing at the playlist itself. The result is
        sorted with _sort_formats.
        """

        # Entry for the playlist itself, ranked below the individual variants
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # m3u8_url is looked up when the lambda is called, i.e. after it has
        # been replaced by the final URL of the response below
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()
        # A Media Playlist Tag MUST NOT appear in a Master Playlist
        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA tag
        last_info = None
        last_media = None
        # Matches KEY=value or KEY="quoted value" attribute pairs
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags, comments and blank lines
                continue
            else:
                # A URI line
                if last_info is None:
                    # No #EXT-X-STREAM-INF seen so far: only the URL is known
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                # Fall back to the bitrate, then to the number of formats so far
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    # The media info is attached to one format only
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
1088
1089     @staticmethod
1090     def _xpath_ns(path, namespace=None):
1091         if not namespace:
1092             return path
1093         out = []
1094         for c in path.split('/'):
1095             if not c or c == '.':
1096                 out.append(c)
1097             else:
1098                 out.append('{%s}%s' % (namespace, c))
1099         return '/'.join(out)
1100
1101     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1102         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1103
1104         if smil is False:
1105             assert not fatal
1106             return []
1107
1108         namespace = self._parse_smil_namespace(smil)
1109
1110         return self._parse_smil_formats(
1111             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1112
1113     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1114         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1115         if smil is False:
1116             return {}
1117         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1118
1119     def _download_smil(self, smil_url, video_id, fatal=True):
1120         return self._download_xml(
1121             smil_url, video_id, 'Downloading SMIL file',
1122             'Unable to download SMIL file', fatal=fatal)
1123
1124     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1125         namespace = self._parse_smil_namespace(smil)
1126
1127         formats = self._parse_smil_formats(
1128             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1129         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1130
1131         video_id = os.path.splitext(url_basename(smil_url))[0]
1132         title = None
1133         description = None
1134         upload_date = None
1135         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1136             name = meta.attrib.get('name')
1137             content = meta.attrib.get('content')
1138             if not name or not content:
1139                 continue
1140             if not title and name == 'title':
1141                 title = content
1142             elif not description and name in ('description', 'abstract'):
1143                 description = content
1144             elif not upload_date and name == 'date':
1145                 upload_date = unified_strdate(content)
1146
1147         thumbnails = [{
1148             'id': image.get('type'),
1149             'url': image.get('src'),
1150             'width': int_or_none(image.get('width')),
1151             'height': int_or_none(image.get('height')),
1152         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1153
1154         return {
1155             'id': video_id,
1156             'title': title or video_id,
1157             'description': description,
1158             'upload_date': upload_date,
1159             'thumbnails': thumbnails,
1160             'formats': formats,
1161             'subtitles': subtitles,
1162         }
1163
1164     def _parse_smil_namespace(self, smil):
1165         return self._search_regex(
1166             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1167
1168     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1169         base = smil_url
1170         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1171             b = meta.get('base') or meta.get('httpBase')
1172             if b:
1173                 base = b
1174                 break
1175
1176         formats = []
1177         rtmp_count = 0
1178         http_count = 0
1179         m3u8_count = 0
1180
1181         videos = smil.findall(self._xpath_ns('.//video', namespace))
1182         for video in videos:
1183             src = video.get('src')
1184             if not src:
1185                 continue
1186
1187             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1188             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1189             width = int_or_none(video.get('width'))
1190             height = int_or_none(video.get('height'))
1191             proto = video.get('proto')
1192             ext = video.get('ext')
1193             src_ext = determine_ext(src)
1194             streamer = video.get('streamer') or base
1195
1196             if proto == 'rtmp' or streamer.startswith('rtmp'):
1197                 rtmp_count += 1
1198                 formats.append({
1199                     'url': streamer,
1200                     'play_path': src,
1201                     'ext': 'flv',
1202                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1203                     'tbr': bitrate,
1204                     'filesize': filesize,
1205                     'width': width,
1206                     'height': height,
1207                 })
1208                 if transform_rtmp_url:
1209                     streamer, src = transform_rtmp_url(streamer, src)
1210                     formats[-1].update({
1211                         'url': streamer,
1212                         'play_path': src,
1213                     })
1214                 continue
1215
1216             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1217
1218             if proto == 'm3u8' or src_ext == 'm3u8':
1219                 m3u8_formats = self._extract_m3u8_formats(
1220                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1221                 if len(m3u8_formats) == 1:
1222                     m3u8_count += 1
1223                     m3u8_formats[0].update({
1224                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1225                         'tbr': bitrate,
1226                         'width': width,
1227                         'height': height,
1228                     })
1229                 formats.extend(m3u8_formats)
1230                 continue
1231
1232             if src_ext == 'f4m':
1233                 f4m_url = src_url
1234                 if not f4m_params:
1235                     f4m_params = {
1236                         'hdcore': '3.2.0',
1237                         'plugin': 'flowplayer-3.2.0.1',
1238                     }
1239                 f4m_url += '&' if '?' in f4m_url else '?'
1240                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1241                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1242                 continue
1243
1244             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1245                 http_count += 1
1246                 formats.append({
1247                     'url': src_url,
1248                     'ext': ext or src_ext or 'flv',
1249                     'format_id': 'http-%d' % (bitrate or http_count),
1250                     'tbr': bitrate,
1251                     'filesize': filesize,
1252                     'width': width,
1253                     'height': height,
1254                 })
1255                 continue
1256
1257         self._sort_formats(formats)
1258
1259         return formats
1260
1261     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1262         subtitles = {}
1263         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1264             src = textstream.get('src')
1265             if not src:
1266                 continue
1267             ext = textstream.get('ext') or determine_ext(src)
1268             if not ext:
1269                 type_ = textstream.get('type')
1270                 SUBTITLES_TYPES = {
1271                     'text/vtt': 'vtt',
1272                     'text/srt': 'srt',
1273                     'application/smptett+xml': 'tt',
1274                 }
1275                 if type_ in SUBTITLES_TYPES:
1276                     ext = SUBTITLES_TYPES[type_]
1277             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1278             subtitles.setdefault(lang, []).append({
1279                 'url': src,
1280                 'ext': ext,
1281             })
1282         return subtitles
1283
1284     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1285         xspf = self._download_xml(
1286             playlist_url, playlist_id, 'Downloading xpsf playlist',
1287             'Unable to download xspf manifest', fatal=fatal)
1288         if xspf is False:
1289             return []
1290         return self._parse_xspf(xspf, playlist_id)
1291
1292     def _parse_xspf(self, playlist, playlist_id):
1293         NS_MAP = {
1294             'xspf': 'http://xspf.org/ns/0/',
1295             's1': 'http://static.streamone.nl/player/ns/0',
1296         }
1297
1298         entries = []
1299         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1300             title = xpath_text(
1301                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1302             description = xpath_text(
1303                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1304             thumbnail = xpath_text(
1305                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1306             duration = float_or_none(
1307                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1308
1309             formats = [{
1310                 'url': location.text,
1311                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1312                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1313                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1314             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1315             self._sort_formats(formats)
1316
1317             entries.append({
1318                 'id': playlist_id,
1319                 'title': title,
1320                 'description': description,
1321                 'thumbnail': thumbnail,
1322                 'duration': duration,
1323                 'formats': formats,
1324             })
1325         return entries
1326
1327     def _live_title(self, name):
1328         """ Generate the title for a live video """
1329         now = datetime.datetime.now()
1330         now_str = now.strftime("%Y-%m-%d %H:%M")
1331         return name + ' ' + now_str
1332
1333     def _int(self, v, name, fatal=False, **kwargs):
1334         res = int_or_none(v, **kwargs)
1335         if 'get_attr' in kwargs:
1336             print(getattr(v, kwargs['get_attr']))
1337         if res is None:
1338             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1339             if fatal:
1340                 raise ExtractorError(msg)
1341             else:
1342                 self._downloader.report_warning(msg)
1343         return res
1344
1345     def _float(self, v, name, fatal=False, **kwargs):
1346         res = float_or_none(v, **kwargs)
1347         if res is None:
1348             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1349             if fatal:
1350                 raise ExtractorError(msg)
1351             else:
1352                 self._downloader.report_warning(msg)
1353         return res
1354
1355     def _set_cookie(self, domain, name, value, expire_time=None):
1356         cookie = compat_cookiejar.Cookie(
1357             0, name, value, None, None, domain, None,
1358             None, '/', True, False, expire_time, '', None, None, None)
1359         self._downloader.cookiejar.set_cookie(cookie)
1360
1361     def _get_cookies(self, url):
1362         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1363         req = sanitized_Request(url)
1364         self._downloader.cookiejar.add_cookie_header(req)
1365         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1366
1367     def get_testcases(self, include_onlymatching=False):
1368         t = getattr(self, '_TEST', None)
1369         if t:
1370             assert not hasattr(self, '_TESTS'), \
1371                 '%s has _TEST and _TESTS' % type(self).__name__
1372             tests = [t]
1373         else:
1374             tests = getattr(self, '_TESTS', [])
1375         for t in tests:
1376             if not include_onlymatching and t.get('only_matching', False):
1377                 continue
1378             t['name'] = type(self).__name__[:-len('IE')]
1379             yield t
1380
1381     def is_suitable(self, age_limit):
1382         """ Test whether the extractor is generally suitable for the given
1383         age limit (i.e. pornographic sites are not, all others usually are) """
1384
1385         any_restricted = False
1386         for tc in self.get_testcases(include_onlymatching=False):
1387             if 'playlist' in tc:
1388                 tc = tc['playlist'][0]
1389             is_restricted = age_restricted(
1390                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1391             if not is_restricted:
1392                 return True
1393             any_restricted = any_restricted or is_restricted
1394         return not any_restricted
1395
1396     def extract_subtitles(self, *args, **kwargs):
1397         if (self._downloader.params.get('writesubtitles', False) or
1398                 self._downloader.params.get('listsubtitles')):
1399             return self._get_subtitles(*args, **kwargs)
1400         return {}
1401
1402     def _get_subtitles(self, *args, **kwargs):
1403         raise NotImplementedError("This method must be implemented by subclasses")
1404
1405     @staticmethod
1406     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1407         """ Merge subtitle items for one language. Items with duplicated URLs
1408         will be dropped. """
1409         list1_urls = set([item['url'] for item in subtitle_list1])
1410         ret = list(subtitle_list1)
1411         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1412         return ret
1413
1414     @classmethod
1415     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1416         """ Merge two subtitle dictionaries, language by language. """
1417         ret = dict(subtitle_dict1)
1418         for lang in subtitle_dict2:
1419             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1420         return ret
1421
1422     def extract_automatic_captions(self, *args, **kwargs):
1423         if (self._downloader.params.get('writeautomaticsub', False) or
1424                 self._downloader.params.get('listsubtitles')):
1425             return self._get_automatic_captions(*args, **kwargs)
1426         return {}
1427
1428     def _get_automatic_captions(self, *args, **kwargs):
1429         raise NotImplementedError("This method must be implemented by subclasses")
1430
1431
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY<prefix>:{query} where <prefix>
    is empty (one result), a positive integer (that many results) or "all"
    (_MAX_RESULTS results).
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source that matches this extractor's queries."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested results.

        Raises ExtractorError when query does not match the expected
        format. Requests for more than _MAX_RESULTS results are capped
        with a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, terms = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(terms, 1)
        if prefix == 'all':
            return self._get_n_results(terms, self._MAX_RESULTS)

        count = int(prefix)
        # Defensive: the numeric alternative of the regex above only admits
        # positive integers.
        if count <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (count, terms))
        if count > self._MAX_RESULTS:
            self._downloader.report_warning(
                '%s returns max %i results (you requested %i)' % (
                    self._SEARCH_KEY, self._MAX_RESULTS, count))
            count = self._MAX_RESULTS
        return self._get_n_results(terms, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor."""
        return self._SEARCH_KEY