1 from __future__ import unicode_literals
15 from ..compat import (
18 compat_etree_fromstring,
54 class InfoExtractor(object):
55 """Information Extractor class.
57 Information extractors are the classes that, given a URL, extract
58 information about the video (or videos) the URL refers to. This
59 information includes the real video URL, the video title, author and
60 others. The information is stored in a dictionary which is then
61 passed to the YoutubeDL. The YoutubeDL processes this
62 information possibly downloading the video to the file system, among
63 other possible outcomes.
65 The type field determines the type of the result.
66 By far the most common value (and the default if _type is missing) is
67 "video", which indicates a single video.
69 For a video, the dictionaries must include the following fields:
72 title: Video title, unescaped.
74 Additionally, it must contain either a formats entry or a url one:
76 formats: A list of dictionaries for each format available, ordered
77 from worst to best quality.
80 * url Mandatory. The URL of the video file
81 * ext Will be calculated from URL if missing
82 * format A human-readable description of the format
83 ("mp4 container with h264/opus").
84 Calculated from the format_id, width, height,
85 and format_note fields if missing.
86 * format_id A short description of the format
87 ("mp4_h264_opus" or "19").
88 Technically optional, but strongly recommended.
89 * format_note Additional info about the format
90 ("3D" or "DASH video")
91 * width Width of the video, if known
92 * height Height of the video, if known
93 * resolution Textual description of width and height
94 * tbr Average bitrate of audio and video in KBit/s
95 * abr Average audio bitrate in KBit/s
96 * acodec Name of the audio codec in use
97 * asr Audio sampling rate in Hertz
98 * vbr Average video bitrate in KBit/s
100 * vcodec Name of the video codec in use
101 * container Name of the container format
102 * filesize The number of bytes, if known in advance
103 * filesize_approx An estimate for the number of bytes
104 * player_url SWF Player URL (used for rtmpdump).
105 * protocol The protocol that will be used for the actual
106 download, lower-case.
107 "http", "https", "rtsp", "rtmp", "rtmpe",
108 "m3u8", or "m3u8_native".
109 * preference Order number of this format. If this field is
110 present and not None, the formats get sorted
111 by this field, regardless of all other values.
112 -1 for default (order by other properties),
113 -2 or smaller for less than default.
114 < -1000 to hide the format (if there is
115 another one which is strictly better)
116 * language Language code, e.g. "de" or "en-US".
117 * language_preference Is this in the language mentioned in
119 10 if it's what the URL is about,
120 -1 for default (don't know),
121 -10 otherwise, other values reserved for now.
122 * quality Order number of the video quality of this
123 format, irrespective of the file format.
124 -1 for default (order by other properties),
125 -2 or smaller for less than default.
126 * source_preference Order number for this video source
127 (quality takes higher priority)
128 -1 for default (order by other properties),
129 -2 or smaller for less than default.
130 * http_headers A dictionary of additional HTTP headers
131 to add to the request.
132 * stretched_ratio If given and not 1, indicates that the
133 video's pixels are not square.
134 width : height ratio as float.
135 * no_resume The server does not support resuming the
136 (HTTP or RTMP) download. Boolean.
138 url: Final video URL.
139 ext: Video filename extension.
140 format: The video format, defaults to ext (used for --get-format)
141 player_url: SWF Player URL (used for rtmpdump).
143 The following fields are optional:
145 alt_title: A secondary title of the video.
146 display_id An alternative identifier for the video, not necessarily
147 unique, but available before title. Typically, id is
148 something like "4234987", title "Dancing naked mole rats",
149 and display_id "dancing-naked-mole-rats"
150 thumbnails: A list of dictionaries, with the following entries:
151 * "id" (optional, string) - Thumbnail format ID
153 * "preference" (optional, int) - quality of the image
154 * "width" (optional, int)
155 * "height" (optional, int)
156 * "resolution" (optional, string "{width}x{height}",
158 thumbnail: Full URL to a video thumbnail image.
159 description: Full video description.
160 uploader: Full name of the video uploader.
161 license: License name the video is licensed under.
162 creator: The main artist who created the video.
163 release_date: The date (YYYYMMDD) when the video was released.
164 timestamp: UNIX timestamp of the moment the video became available.
165 upload_date: Video upload date (YYYYMMDD).
166 If not explicitly set, calculated from timestamp.
167 uploader_id: Nickname or id of the video uploader.
168 uploader_url: Full URL to a personal webpage of the video uploader.
169 location: Physical location where the video was filmed.
170 subtitles: The available subtitles as a dictionary in the format
171 {language: subformats}. "subformats" is a list sorted from
172 lower to higher preference, each element is a dictionary
173 with the "ext" entry and one of:
174 * "data": The subtitles file contents
175 * "url": A URL pointing to the subtitles file
176 "ext" will be calculated from URL if missing
177 automatic_captions: Like 'subtitles', used by the YoutubeIE for
178 automatically generated captions
179 duration: Length of the video in seconds, as an integer or float.
180 view_count: How many users have watched the video on the platform.
181 like_count: Number of positive ratings of the video
182 dislike_count: Number of negative ratings of the video
183 repost_count: Number of reposts of the video
184 average_rating: Average rating given by users, the scale used depends on the webpage
185 comment_count: Number of comments on the video
186 comments: A list of comments, each with one or more of the following
187 properties (all but one of text or html optional):
188 * "author" - human-readable name of the comment author
189 * "author_id" - user ID of the comment author
191 * "html" - Comment as HTML
192 * "text" - Plain text of the comment
193 * "timestamp" - UNIX timestamp of comment
194 * "parent" - ID of the comment this one is replying to.
195 Set to "root" to indicate that this is a
196 comment to the original video.
197 age_limit: Age restriction for the video, as an integer (years)
198 webpage_url: The URL to the video webpage, if given to youtube-dl it
199 should allow to get the same result again. (It will be set
200 by YoutubeDL if it's missing)
201 categories: A list of categories that the video falls in, for example
203 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
204 is_live: True, False, or None (=unknown). Whether this video is a
205 live stream that goes on instead of a fixed-length video.
206 start_time: Time in seconds where the reproduction should start, as
207 specified in the URL.
208 end_time: Time in seconds where the reproduction should end, as
209 specified in the URL.
211 The following fields should only be used when the video belongs to some logical
214 chapter: Name or title of the chapter the video belongs to.
215 chapter_number: Number of the chapter the video belongs to, as an integer.
216 chapter_id: Id of the chapter the video belongs to, as a unicode string.
218 The following fields should only be used when the video is an episode of some
221 series: Title of the series or programme the video episode belongs to.
222 season: Title of the season the video episode belongs to.
223 season_number: Number of the season the video episode belongs to, as an integer.
224 season_id: Id of the season the video episode belongs to, as a unicode string.
225 episode: Title of the video episode. Unlike mandatory video title field,
226 this field should denote the exact title of the video episode
227 without any kind of decoration.
228 episode_number: Number of the video episode within a season, as an integer.
229 episode_id: Id of the video episode, as a unicode string.
231 Unless mentioned otherwise, the fields should be Unicode strings.
233 Unless mentioned otherwise, None is equivalent to absence of information.
236 _type "playlist" indicates multiple videos.
237 There must be a key "entries", which is a list, an iterable, or a PagedList
238 object, each element of which is a valid dictionary by this specification.
240 Additionally, playlists can have "title", "description" and "id" attributes
241 with the same semantics as videos (see above).
244 _type "multi_video" indicates that there are multiple videos that
245 form a single show, for example multiple acts of an opera or TV episode.
246 It must have an entries key like a playlist and contain all the keys
247 required for a video at the same time.
250 _type "url" indicates that the video must be extracted from another
251 location, possibly by a different extractor. Its only required key is:
252 "url" - the next URL to extract.
253 The key "ie_key" can be set to the class name (minus the trailing "IE",
254 e.g. "Youtube") if the extractor class is known in advance.
255 Additionally, the dictionary may have any properties of the resolved entity
256 known in advance, for example "title" if the title of the referred video is
260 _type "url_transparent" entities have the same specification as "url", but
261 indicate that the given additional information is more precise than the one
262 associated with the resolved URL.
263 This is useful when a site employs a video service that hosts the video and
264 its technical metadata, but that video service does not embed a useful
265 title, description etc.
268 Subclasses of this one should re-define the _real_initialize() and
269 _real_extract() methods and define a _VALID_URL regexp.
270 Probably, they should also be added to the list of extractors.
272 Finally, the _WORKING attribute should be set to False for broken IEs
273 in order to warn the users and skip the tests.
# NOTE(review): this excerpt is garbled — indentation is stripped, original
# line numbers are embedded in each line, and the jump in the embedded
# numbering (281 -> 283) shows at least one statement is missing here.
# Restore from upstream before editing the code itself.
280 def __init__(self, downloader=None):
281 """Constructor. Receives an optional downloader."""
# The downloader is stored via set_downloader() below.
283 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Look the compiled pattern up on *this* class only (cls.__dict__
    # rather than getattr), so each subclass caches its own compiled
    # regexp instead of silently reusing a superclass's cache.
    cached = cls.__dict__.get('_VALID_URL_RE')
    if cached is None:
        cached = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cached.match(url) is not None
# NOTE(review): garbled excerpt — indentation stripped, embedded original
# line numbers, and interior lines missing (embedded numbering jumps, e.g.
# 300 -> 306, 316 -> 319). The fragments below cover _match_id, the
# _WORKING getter, initialize() and extract(); restore from upstream
# before editing.
297 def _match_id(cls, url):
298 if '_VALID_URL_RE' not in cls.__dict__:
299 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
300 m = cls._VALID_URL_RE.match(url)
# The lines returning the matched id group are missing from this excerpt —
# TODO confirm against upstream.
306 """Getter method for _WORKING."""
309 def initialize(self):
310 """Initializes an instance (authentication, etc)."""
312 self._real_initialize()
315 def extract(self, url):
316 """Extracts URL information and returns it in list of dicts."""
# extract() wraps _real_extract in error translation: network errors and
# extractor-internal KeyError/StopIteration become ExtractorError.
319 return self._real_extract(url)
320 except ExtractorError:
322 except compat_http_client.IncompleteRead as e:
323 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
324 except (KeyError, StopIteration) as e:
325 raise ExtractorError('An extractor error has occurred.', cause=e)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Kept for later use by the _download_*/report_* helpers; may be None.
    self._downloader = downloader
# NOTE(review): garbled excerpt — the bodies of these subclass hooks and
# the ie_key()/IE_NAME definitions are partially missing (embedded
# numbering jumps). Restore from upstream before editing.
331 def _real_initialize(self):
332 """Real initialization process. Redefine in subclasses."""
335 def _real_extract(self, url):
336 """Real extraction process. Redefine in subclasses."""
# ie_key(): class name minus the trailing "IE" suffix.
341 """A string for getting the InfoExtractor with get_info_extractor"""
342 return compat_str(cls.__name__[:-2])
# IE_NAME: instance-level variant of the same derivation.
346 return compat_str(type(self).__name__[:-2])
# NOTE(review): garbled excerpt covering the page-download helper family
# (_request_webpage, _download_webpage_handle, _guess_encoding_from_content,
# _webpage_read_content, _download_webpage, _download_xml, _download_json,
# _parse_json). Indentation is stripped, original line numbers are embedded,
# and many interior lines (try/except scaffolding, returns, blank lines)
# are missing — the embedded numbering jumps repeatedly. Restore from
# upstream before editing the code itself; the comments below only
# annotate the visible fragments.
348 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
349 """ Returns the response handle """
# note=None -> default "Downloading webpage" message; note=False suppresses it.
351 self.report_download_webpage(video_id)
352 elif note is not False:
354 self.to_screen('%s' % (note,))
356 self.to_screen('%s: %s' % (video_id, note))
358 return self._downloader.urlopen(url_or_request)
# On network failure: raise when fatal, otherwise warn and (presumably)
# return False — the return line is missing here; TODO confirm upstream.
359 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
363 errnote = 'Unable to download webpage'
365 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
367 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
369 self._downloader.report_warning(errmsg)
372 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
373 """ Returns a tuple (page content as string, URL handle) """
374 # Strip hashes from the URL (#1038)
375 if isinstance(url_or_request, (compat_str, str)):
376 url_or_request = url_or_request.partition('#')[0]
378 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
382 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
383 return (content, urlh)
# Encoding detection order: Content-Type charset, then an in-document
# <meta charset>, then a BOM sniff.
386 def _guess_encoding_from_content(content_type, webpage_bytes):
387 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
389 encoding = m.group(1)
391 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
392 webpage_bytes[:1024])
394 encoding = m.group(1).decode('ascii')
395 elif webpage_bytes.startswith(b'\xff\xfe'):
402 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
403 content_type = urlh.headers.get('Content-Type', '')
404 webpage_bytes = urlh.read()
405 if prefix is not None:
406 webpage_bytes = prefix + webpage_bytes
408 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
# --dump-pages: base64-dump the raw response to the screen.
409 if self._downloader.params.get('dump_intermediate_pages', False):
411 url = url_or_request.get_full_url()
412 except AttributeError:
414 self.to_screen('Dumping request to ' + url)
415 dump = base64.b64encode(webpage_bytes).decode('ascii')
416 self._downloader.to_screen(dump)
# --write-pages: save the raw response to a sanitized dump file.
417 if self._downloader.params.get('write_pages', False):
419 url = url_or_request.get_full_url()
420 except AttributeError:
422 basen = '%s_%s' % (video_id, url)
424 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
425 basen = basen[:240 - len(h)] + h
426 raw_filename = basen + '.dump'
427 filename = sanitize_filename(raw_filename, restricted=True)
428 self.to_screen('Saving request to ' + filename)
429 # Working around MAX_PATH limitation on Windows (see
430 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
431 if compat_os_name == 'nt':
432 absfilepath = os.path.abspath(filename)
433 if len(absfilepath) > 259:
434 filename = '\\\\?\\' + absfilepath
435 with open(filename, 'wb') as outf:
436 outf.write(webpage_bytes)
438 content = webpage_bytes.decode(encoding, 'replace')
441 content = webpage_bytes.decode('utf-8', 'replace')
# Detect well-known block pages (Websense, Indian censorship) and raise
# an expected ExtractorError with a helpful message.
443 if ('<title>Access to this site is blocked</title>' in content and
444 'Websense' in content[:512]):
445 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
446 blocked_iframe = self._html_search_regex(
447 r'<iframe src="([^"]+)"', content,
448 'Websense information URL', default=None)
450 msg += ' Visit %s for more details' % blocked_iframe
451 raise ExtractorError(msg, expected=True)
452 if '<title>The URL you requested has been blocked</title>' in content[:512]:
454 'Access to this webpage has been blocked by Indian censorship. '
455 'Use a VPN or proxy server (with --proxy) to route around it.')
456 block_msg = self._html_search_regex(
457 r'</h1><p>(.*?)</p>',
458 content, 'block message', default=None)
460 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
461 raise ExtractorError(msg, expected=True)
# _download_webpage retries IncompleteRead up to `tries` times, sleeping
# `timeout` seconds between attempts.
465 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
466 """ Returns the data of the page as a string """
469 while success is False:
471 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
473 except compat_http_client.IncompleteRead as e:
475 if try_count >= tries:
477 self._sleep(timeout, video_id)
484 def _download_xml(self, url_or_request, video_id,
485 note='Downloading XML', errnote='Unable to download XML',
486 transform_source=None, fatal=True, encoding=None):
487 """Return the xml as an xml.etree.ElementTree.Element"""
488 xml_string = self._download_webpage(
489 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
490 if xml_string is False:
493 xml_string = transform_source(xml_string)
494 return compat_etree_fromstring(xml_string.encode('utf-8'))
496 def _download_json(self, url_or_request, video_id,
497 note='Downloading JSON metadata',
498 errnote='Unable to download JSON metadata',
499 transform_source=None,
500 fatal=True, encoding=None):
501 json_string = self._download_webpage(
502 url_or_request, video_id, note, errnote, fatal=fatal,
504 if (not fatal) and json_string is False:
506 return self._parse_json(
507 json_string, video_id, transform_source=transform_source, fatal=fatal)
# _parse_json: ValueError from json.loads is re-raised as ExtractorError
# when fatal, otherwise only warned about.
509 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
511 json_string = transform_source(json_string)
513 return json.loads(json_string)
514 except ValueError as ve:
515 errmsg = '%s: Failed to parse JSON ' % video_id
517 raise ExtractorError(errmsg, cause=ve)
519 self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with the IE name."""
    # Optional "<video_id>: " prefix, mirroring the other report_* helpers.
    prefix = '' if video_id is None else '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, prefix, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    tagged = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen('Confirming age')
def report_login(self):
    """Report attempt to log in."""
    self.to_screen('Logging in')
# NOTE(review): garbled excerpt — the closing keyword arguments of both
# raise statements (presumably expected=True) are missing; the embedded
# numbering jumps 549 -> 553 and ends at 555. Restore from upstream.
547 def raise_login_required(msg='This video is only available for registered users'):
548 raise ExtractorError(
549 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
553 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
554 raise ExtractorError(
555 '%s. You might want to use --proxy to workaround.' % msg,
558 # Methods for following #608
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a URL that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    # id/title are only included when the caller actually knows them.
    if video_id is not None:
        video_info['id'] = video_id
    if video_title is not None:
        video_info['title'] = video_title
    return video_info
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist"""
    video_info = {
        '_type': 'playlist',
        'entries': entries,
    }
    # Optional metadata is added only when truthy (note: truthiness, not
    # an explicit None check, matching the original behavior).
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    if playlist_description:
        video_info['description'] = playlist_description
    return video_info
# NOTE(review): garbled excerpt covering _search_regex, _html_search_regex,
# _get_login_info and _get_tfa_info. Indentation is stripped and many
# interior lines (docstring quotes, else branches, returns) are missing —
# the embedded numbering jumps. Restore from upstream before editing.
585 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
587 Perform a regex search on the given string, using a single or a list of
588 patterns returning the first matching group.
589 In case of failure return a default value or raise a WARNING or a
590 RegexNotFoundError, depending on fatal, specifying the field name.
# A single pattern (string or compiled) is searched directly; otherwise
# the list of patterns is tried in order.
592 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
593 mobj = re.search(pattern, string, flags)
596 mobj = re.search(p, string, flags)
# Colorize the field name in error output on capable terminals.
600 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
601 _name = '\033[0;34m%s\033[0m' % name
607 # return the first matching group
608 return next(g for g in mobj.groups() if g is not None)
610 return mobj.group(group)
611 elif default is not NO_DEFAULT:
614 raise RegexNotFoundError('Unable to extract %s' % _name)
616 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
619 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
621 Like _search_regex, but strips HTML tags and unescapes entities.
623 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
625 return clean_html(res).strip()
629 def _get_login_info(self):
631 Get the login info as (username, password)
632 It will look in the netrc file using the _NETRC_MACHINE value
633 If there's no info available, return (None, None)
635 if self._downloader is None:
640 downloader_params = self._downloader.params
642 # Attempt to use provided username and password or .netrc data
643 if downloader_params.get('username') is not None:
644 username = downloader_params['username']
645 password = downloader_params['password']
646 elif downloader_params.get('usenetrc', False):
648 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
653 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# .netrc problems are downgraded to a warning rather than aborting.
654 except (IOError, netrc.NetrcParseError) as err:
655 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
657 return (username, password)
659 def _get_tfa_info(self, note='two-factor verification code'):
661 Get the two-factor authentication info
662 TODO - asking the user will be required for sms/phone verify
663 currently just uses the command line option
664 If there's no info available, return None
666 if self._downloader is None:
668 downloader_params = self._downloader.params
670 if downloader_params.get('twofactor') is not None:
671 return downloader_params['twofactor']
673 return compat_getpass('Type %s and press [Return]: ' % note)
675 # Helper functions for extracting OpenGraph info
677 def _og_regexes(prop):
678 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
679 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
680 % {'prop': re.escape(prop)})
681 template = r'<meta[^>]+?%s[^>]+?%s'
683 template % (property_re, content_re),
684 template % (content_re, property_re),
688 def _meta_regex(prop):
689 return r'''(?isx)<meta
690 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
691 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
# NOTE(review): garbled excerpt — the default-name guard and the
# escaped-is-None early return appear to be missing (embedded numbering
# jumps 693 -> 695 and 696 -> 699). Restore from upstream before editing.
693 def _og_search_property(self, prop, html, name=None, **kargs):
695 name = 'OpenGraph %s' % prop
696 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
# HTML entities in the matched value are unescaped before returning.
699 return unescapeHTML(escaped)
701 def _og_search_thumbnail(self, html, **kargs):
702 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
704 def _og_search_description(self, html, **kargs):
705 return self._og_search_property('description', html, fatal=False, **kargs)
707 def _og_search_title(self, html, **kargs):
708 return self._og_search_property('title', html, **kargs)
# NOTE(review): garbled excerpt — the `if secure:` guard before line 713
# appears to be missing (embedded numbering jumps 711 -> 713). Restore
# from upstream before editing.
710 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
711 regexes = self._og_regexes('video') + self._og_regexes('video:url')
# og:video:secure_url is preferred (prepended) — presumably only when
# `secure` is truthy; TODO confirm upstream.
713 regexes = self._og_regexes('video:secure_url') + regexes
714 return self._html_search_regex(regexes, html, name, **kargs)
716 def _og_search_url(self, html, **kargs):
717 return self._og_search_property('url', html, **kargs)
# NOTE(review): garbled excerpt — the line assigning a default
# display_name (presumably `display_name = name`) is missing between
# embedded lines 720 and 722. Restore from upstream before editing.
719 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
720 if display_name is None:
722 return self._html_search_regex(
723 self._meta_regex(name),
724 html, display_name, fatal=fatal, group='content', **kwargs)
726 def _dc_search_uploader(self, html):
727 return self._html_search_meta('dc.creator', html, 'uploader')
# NOTE(review): garbled excerpt covering _rta_search,
# _media_rating_search and _family_friendly_search. The RATING_TABLE
# literals and several return statements are missing (embedded numbering
# jumps 732 -> 737, 739 -> 751, 757 -> 766). Restore from upstream.
729 def _rta_search(self, html):
730 # See http://www.rtalabel.org/index.php?content=howtofaq#single
731 if re.search(r'(?ix)<meta\s+name="rating"\s+'
732 r' content="RTA-5042-1996-1400-1577-RTA"',
737 def _media_rating_search(self, html):
738 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
739 rating = self._html_search_meta('rating', html)
# RATING_TABLE (not visible here) maps rating strings to age limits.
751 return RATING_TABLE.get(rating.lower())
753 def _family_friendly_search(self, html):
754 # See http://schema.org/VideoObject
755 family_friendly = self._html_search_meta('isFamilyFriendly', html)
757 if not family_friendly:
766 return RATING_TABLE.get(family_friendly.lower())
768 def _twitter_search_player(self, html):
769 return self._html_search_meta('twitter:player', html,
770 'twitter card player')
# NOTE(review): garbled excerpt covering _search_json_ld, _json_ld and
# _hidden_inputs. Indentation is stripped and interior lines (default
# handling, dict literal openers, `continue` statements, the final
# return of _hidden_inputs) are missing — the embedded numbering jumps.
# Restore from upstream before editing.
772 def _search_json_ld(self, html, video_id, **kwargs):
773 json_ld = self._search_regex(
774 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
775 html, 'JSON-LD', group='json_ld', **kwargs)
778 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
780 def _json_ld(self, json_ld, video_id, fatal=True):
781 if isinstance(json_ld, compat_str):
782 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
# Only schema.org-context documents are interpreted; TVEpisode and
# Article @types get dedicated field mappings.
786 if json_ld.get('@context') == 'http://schema.org':
787 item_type = json_ld.get('@type')
788 if item_type == 'TVEpisode':
790 'episode': unescapeHTML(json_ld.get('name')),
791 'episode_number': int_or_none(json_ld.get('episodeNumber')),
792 'description': unescapeHTML(json_ld.get('description')),
794 part_of_season = json_ld.get('partOfSeason')
795 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
796 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
797 part_of_series = json_ld.get('partOfSeries')
798 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
799 info['series'] = unescapeHTML(part_of_series.get('name'))
800 elif item_type == 'Article':
802 'timestamp': parse_iso8601(json_ld.get('datePublished')),
803 'title': unescapeHTML(json_ld.get('headline')),
804 'description': unescapeHTML(json_ld.get('articleBody')),
# Drop keys whose extracted value is None.
806 return dict((k, v) for k, v in info.items() if v is not None)
809 def _hidden_inputs(html):
# HTML comments are removed first so commented-out inputs are ignored.
810 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
812 for input in re.findall(r'(?i)<input([^>]+)>', html):
813 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
815 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
818 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
821 hidden_inputs[name.group('value')] = value.group('value')
824 def _form_hidden_inputs(self, form_id, html):
825 form = self._search_regex(
826 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
827 html, '%s form' % form_id, group='form')
828 return self._hidden_inputs(form)
# NOTE(review): garbled excerpt covering _sort_formats and the start of
# _check_formats. Indentation is stripped and many interior lines (the
# _formats_key def line, try/except around ORDER.index, parts of the key
# tuple, protocol/extension preference lines) are missing — the embedded
# numbering jumps repeatedly. Restore from upstream before editing.
830 def _sort_formats(self, formats, field_preference=None):
832 raise ExtractorError('No video formats found')
835 # Automatically determine tbr when missing based on abr and vbr (improves
836 # formats sorting in some cases)
837 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
838 f['tbr'] = f['abr'] + f['vbr']
841 # TODO remove the following workaround
842 from ..utils import determine_ext
843 if not f.get('ext') and 'url' in f:
844 f['ext'] = determine_ext(f['url'])
# A caller-supplied field_preference short-circuits the default ranking.
846 if isinstance(field_preference, (list, tuple)):
847 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
849 preference = f.get('preference')
850 if preference is None:
852 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
855 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
857 if f.get('vcodec') == 'none': # audio only
858 if self._downloader.params.get('prefer_free_formats'):
859 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
861 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
864 audio_ext_preference = ORDER.index(f['ext'])
866 audio_ext_preference = -1
868 if self._downloader.params.get('prefer_free_formats'):
869 ORDER = ['flv', 'mp4', 'webm']
871 ORDER = ['webm', 'flv', 'mp4']
873 ext_preference = ORDER.index(f['ext'])
876 audio_ext_preference = 0
# Key tuple: missing numeric fields are replaced by -1 so None never
# participates in comparisons.
880 f.get('language_preference') if f.get('language_preference') is not None else -1,
881 f.get('quality') if f.get('quality') is not None else -1,
882 f.get('tbr') if f.get('tbr') is not None else -1,
883 f.get('filesize') if f.get('filesize') is not None else -1,
884 f.get('vbr') if f.get('vbr') is not None else -1,
885 f.get('height') if f.get('height') is not None else -1,
886 f.get('width') if f.get('width') is not None else -1,
889 f.get('abr') if f.get('abr') is not None else -1,
890 audio_ext_preference,
891 f.get('fps') if f.get('fps') is not None else -1,
892 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
893 f.get('source_preference') if f.get('source_preference') is not None else -1,
894 f.get('format_id') if f.get('format_id') is not None else '',
# In-place sort from worst to best quality.
896 formats.sort(key=_formats_key)
898 def _check_formats(self, formats, video_id):
901 lambda f: self._is_valid_url(
903 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
907 def _remove_duplicate_formats(formats):
911 if f['url'] not in format_urls:
912 format_urls.add(f['url'])
913 unique_formats.append(f)
914 formats[:] = unique_formats
# NOTE(review): garbled excerpt covering _is_valid_url, http_scheme,
# _proto_relative_url, _sleep and _extract_f4m_formats. Indentation is
# stripped and many interior lines (returns, try blocks, format-dict
# openers) are missing — the embedded numbering jumps repeatedly.
# Restore from upstream before editing.
916 def _is_valid_url(self, url, video_id, item='video'):
917 url = self._proto_relative_url(url, scheme='http:')
918 # For now assume non HTTP(S) URLs always valid
919 if not (url.startswith('http://') or url.startswith('https://')):
922 self._request_webpage(url, video_id, 'Checking %s URL' % item)
924 except ExtractorError as e:
925 if isinstance(e.cause, compat_urllib_error.URLError):
927 '%s: %s URL is invalid, skipping' % (video_id, item))
931 def http_scheme(self):
932 """ Either "http:" or "https:", depending on the user's preferences """
933 if self._downloader.params.get('prefer_insecure', False)
938 def _proto_relative_url(self, url, scheme=None):
# Protocol-relative URLs ("//host/...") get the preferred scheme prepended.
941 if url.startswith('//'):
943 scheme = self.http_scheme()
948 def _sleep(self, timeout, video_id, msg_template=None):
949 if msg_template is None:
950 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
951 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
955 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
956 transform_source=lambda s: fix_xml_ampersands(s).strip(),
958 manifest = self._download_xml(
959 manifest_url, video_id, 'Downloading f4m manifest',
960 'Unable to download f4m manifest',
961 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
962 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
963 transform_source=transform_source,
966 if manifest is False:
# F4M 1.0 and 2.0 use different XML namespaces for <media> nodes.
970 manifest_version = '1.0'
971 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
973 manifest_version = '2.0'
974 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
975 base_url = xpath_text(
976 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
977 'base URL', default=None)
979 base_url = base_url.strip()
980 for i, media_el in enumerate(media_nodes):
981 if manifest_version == '2.0':
982 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
# Relative media URLs are resolved against baseURL or the manifest's
# own directory (assignment target line is missing from this excerpt).
986 media_url if media_url.startswith('http://') or media_url.startswith('https://')
987 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
988 # If media_url is itself a f4m manifest do the recursive extraction
989 # since bitrates in parent manifest (this one) and media_url manifest
990 # may differ leading to inability to resolve the format by requested
991 # bitrate in f4m downloader
992 if determine_ext(manifest_url) == 'f4m':
993 formats.extend(self._extract_f4m_formats(
994 manifest_url, video_id, preference, f4m_id, fatal=fatal))
996 tbr = int_or_none(media_el.attrib.get('bitrate'))
998 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1002 'width': int_or_none(media_el.attrib.get('width')),
1003 'height': int_or_none(media_el.attrib.get('height')),
1004 'preference': preference,
1006 self._sort_formats(formats)
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
        # Parse an HLS (m3u8) playlist into youtube-dl format dicts.
        # A '<id>-meta' pseudo-format always points at the playlist itself so
        # quality selection can be deferred to the downloader.
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            # NOTE(review): preference == 0 falls through to -1 here (falsy
            # check, not `is None`) — confirm 0 is never a meaningful value.
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        # Resolve playlist-relative URIs against the (possibly redirected)
        # playlist URL; absolute http(s) URIs pass through untouched.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        res = self._download_webpage_handle(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
        m3u8_doc, urlh = res
        # Use the final URL after redirects as the base for relative URIs.
        m3u8_url = urlh.geturl()
        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            'format_id': m3u8_id,
            'protocol': entry_protocol,
            'preference': preference,
        # ATTR=value pairs; quoted values may contain commas.
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Attributes of the variant described by the next URI line.
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                # Rendition metadata (audio/subtitle groups etc.).
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags and blank lines carry no format info here.
                if last_info is None:
                    # Bare URI with no preceding STREAM-INF: keep URL only.
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id.append(m3u8_id)
                # Prefer the rendition NAME over tbr/index, except for
                # subtitle renditions.
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                codecs = last_info.get('CODECS')
                    vcodec, acodec = [None] * 2
                    va_codecs = codecs.split(',')
                    if len(va_codecs) == 1:
                        # Audio only entries usually come with single codec and
                        # no resolution. For more robustness we also check it to
                        if not resolution and va_codecs[0].startswith('mp4a'):
                            vcodec, acodec = 'none', va_codecs[0]
                            vcodec = va_codecs[0]
                        vcodec, acodec = va_codecs[:2]
                if last_media is not None:
                    f['m3u8_media'] = last_media
        self._sort_formats(formats)
    def _xpath_ns(path, namespace=None):
        # Qualify each step of an XPath with the given XML namespace,
        # leaving '' and '.' steps untouched.
        for c in path.split('/'):
            if not c or c == '.':
            out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and return only its formats
        # (no metadata/subtitles, unlike _extract_smil_info).
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        namespace = self._parse_smil_namespace(smil)
        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and parse it into a full info dict
        # (formats plus title/description/thumbnails/subtitles).
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1161 def _download_smil(self, smil_url, video_id, fatal=True):
1162 return self._download_xml(
1163 smil_url, video_id, 'Downloading SMIL file',
1164 'Unable to download SMIL file', fatal=fatal)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        # Parse a SMIL document into a complete info dict: formats,
        # subtitles, and <head> metadata (title/description/date).
        namespace = self._parse_smil_namespace(smil)
        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
        # Fall back to the SMIL filename (sans extension) as the id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        # Pull title/description/upload date out of <head><meta> entries;
        # first non-empty value of each kind wins.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)
            # Thumbnails come from <image> nodes that carry a src.
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
1206 def _parse_smil_namespace(self, smil):
1207 return self._search_regex(
1208 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        # Turn each <video> node of a SMIL document into format dicts,
        # dispatching on protocol/extension: RTMP, HLS (m3u8), HDS (f4m),
        # or plain HTTP progressive.
        # <head><meta base=...> provides the base URL for relative srcs.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            # Skip empty and already-seen sources.
            if not src or src in srcs:
            # systemBitrate is in bit/s; convert to KBit/s.
            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'filesize': filesize,
                # Allow the caller to rewrite streamer/play-path pairs
                # (some CDNs need it).
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()
            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry HLS result is a media playlist: fold the
                # SMIL-level bitrate/size info into it.
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)
            if src_ext == 'f4m':
                # Default f4m query parameters expected by Flowplayer-style
                # servers; merged with caller-supplied f4m_params.
                'plugin': 'flowplayer-3.2.0.1',
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            # Plain progressive download; validate the URL first.
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'filesize': filesize,
        self._sort_formats(formats)
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        # Collect subtitle tracks from <textstream> nodes, grouped by
        # language; duplicate URLs are skipped.
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
            # Derive the extension from the node, the URL, or the MIME type.
            ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
            # Several attribute spellings are seen in the wild; fall back to
            # the caller-supplied default language.
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
1322 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1323 xspf = self._download_xml(
1324 playlist_url, playlist_id, 'Downloading xpsf playlist',
1325 'Unable to download xspf manifest', fatal=fatal)
1328 return self._parse_xspf(xspf, playlist_id)
    def _parse_xspf(self, playlist, playlist_id):
        # Parse an XSPF playlist document; each <track> becomes one entry
        # whose formats come from its <location> children.
        # 's1' holds StreamOne's vendor extensions (label/width/height).
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds; convert to seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        # Download a DASH MPD manifest and delegate to _parse_mpd_formats.
        # NOTE(review): formats_dict={} is a mutable default argument — safe
        # only while nothing mutates it; verify before extending.
        res = self._download_webpage_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
        # Base URL = the final (post-redirect) manifest URL up to the last '/'.
        mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
        return self._parse_mpd_formats(
            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
        # Parse a DASH MPD ElementTree into format dicts.
        # Live ('dynamic') manifests are not supported.
        # NOTE(review): formats_dict={} is a mutable default argument.
        if mpd_doc.get('type') == 'dynamic':
        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
            return self._xpath_ns(path, namespace)
        def is_drm_protected(element):
            # A ContentProtection child marks the element as DRM'd.
            return element.find(_add_ns('ContentProtection')) is not None
        def extract_multisegment_info(element, ms_parent_info):
            # Merge this element's SegmentList/SegmentTemplate data over the
            # parent's (Period -> AdaptationSet -> Representation inherit).
            ms_info = ms_parent_info.copy()
            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                initialization = segment_list.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    start_number = segment_template.get('startNumber')
                        ms_info['start_number'] = int(start_number)
                    segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
                    if segment_timeline is not None:
                        s_e = segment_timeline.findall(_add_ns('S'))
                            ms_info['total_number'] = 0
                                # r is the repeat count: each <S> stands for
                                # 1 + r segments.
                                ms_info['total_number'] += 1 + int(s.get('r', '0'))
                    timescale = segment_template.get('timescale')
                        ms_info['timescale'] = int(timescale)
                    segment_duration = segment_template.get('duration')
                    if segment_duration:
                        ms_info['segment_duration'] = int(segment_duration)
                    media_template = segment_template.get('media')
                        ms_info['media_template'] = media_template
                    initialization = segment_template.get('initialization')
                        ms_info['initialization_url'] = initialization
                        initialization = segment_template.find(_add_ns('Initialization'))
                        if initialization is not None:
                            ms_info['initialization_url'] = initialization.attrib['sourceURL']
        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # DRM'd adaptation sets/representations are skipped entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    mime_type = representation_attrib.get('mimeType')
                    content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type == 'video' or content_type == 'audio':
                        # Resolve BaseURL by walking up the hierarchy until
                        # an absolute URL is assembled.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # YouTube embeds the content length in a vendor attr.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                            # 'mul'/'und'/'zxx'/'mis' are ISO 639 "no real
                            # language" codes; treat them as unknown.
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                            # NOTE(review): `and 'segment_duration'` is a
                            # truthy string literal, not a membership test —
                            # almost certainly meant
                            # `'segment_duration' in representation_ms_info`.
                            if 'total_number' not in representation_ms_info and 'segment_duration':
                                segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                            media_template = representation_ms_info['media_template']
                            media_template = media_template.replace('$RepresentationID$', representation_id)
                            # Turn $Number%05d$ / $Bandwidth$ placeholders
                            # into %-style format specifiers.
                            media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
                            # NOTE(review): return value of replace() is
                            # discarded — the '$$' escape is never applied;
                            # should be `media_template = media_template...`.
                            media_template.replace('$$', '$')
                            representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        if 'segment_urls' in representation_ms_info:
                                'segment_urls': representation_ms_info['segment_urls'],
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                                    'initialization_url': initialization_url,
                                if not f.get('url'):
                                    f['url'] = initialization_url
                        # Merge with a format of the same id seen earlier
                        # (e.g. from another Period); otherwise append new.
                            existing_format = next(
                                fo for fo in formats
                                if fo['format_id'] == representation_id)
                        except StopIteration:
                            full_info = formats_dict.get(representation_id, {}).copy()
                            formats.append(full_info)
                            existing_format.update(f)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        self._sort_formats(formats)
1522 def _live_title(self, name):
1523 """ Generate the title for a live video """
1524 now = datetime.datetime.now()
1525 now_str = now.strftime('%Y-%m-%d %H:%M')
1526 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        # Parse v as an int via int_or_none; on failure either raise
        # (fatal=True) or report a warning and fall through to None.
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): looks like a leftover debug print — it writes the
            # attribute straight to stdout; consider removing or routing it
            # through the downloader's logging instead.
            print(getattr(v, kwargs['get_attr']))
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Float counterpart of _int: parse via float_or_none; raise or warn
        # on failure depending on fatal.
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
1550 def _set_cookie(self, domain, name, value, expire_time=None):
1551 cookie = compat_cookiejar.Cookie(
1552 0, name, value, None, None, domain, None,
1553 None, '/', True, False, expire_time, '', None, None, None)
1554 self._downloader.cookiejar.set_cookie(cookie)
1556 def _get_cookies(self, url):
1557 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1558 req = sanitized_Request(url)
1559 self._downloader.cookiejar.add_cookie_header(req)
1560 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        # Yield this extractor's test cases from _TEST/_TESTS, tagging each
        # with the extractor name (class name minus the 'IE' suffix).
        t = getattr(self, '_TEST', None)
            # _TEST and _TESTS are mutually exclusive by convention.
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            # only_matching entries are URL-matching stubs; skip them unless
            # the caller explicitly asked for them.
            if not include_onlymatching and t.get('only_matching', False):
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # The extractor is unsuitable only when every test case is
        # age-restricted beyond the given limit.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if 'playlist' in tc:
                # For playlist tests, judge by the first entry.
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
    def extract_subtitles(self, *args, **kwargs):
        # Public wrapper: fetch subtitles only when the user asked for them
        # (writesubtitles/listsubtitles); delegates to _get_subtitles.
        if (self._downloader.params.get('writesubtitles', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
1597 def _get_subtitles(self, *args, **kwargs):
1598 raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        # Items already present in list1 win; list2 contributes only URLs
        # not seen in list1.
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        # Per-language merge via _merge_subtitle_items; languages present
        # only in dict2 are added as-is.
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
    def extract_automatic_captions(self, *args, **kwargs):
        # Public wrapper: fetch automatic captions only when requested
        # (writeautomaticsub/listsubtitles); delegates to
        # _get_automatic_captions.
        if (self._downloader.params.get('writeautomaticsub', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
1623 def _get_automatic_captions(self, *args, **kwargs):
1624 raise NotImplementedError('This method must be implemented by subclasses')
1626 def mark_watched(self, *args, **kwargs):
1627 if (self._downloader.params.get('mark_watched', False) and
1628 (self._get_login_info()[0] is not None or
1629 self._downloader.params.get('cookiefile') is not None)):
1630 self._mark_watched(*args, **kwargs)
1632 def _mark_watched(self, *args, **kwargs):
1633 raise NotImplementedError('This method must be implemented by subclasses')
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
1644 def _make_valid_url(cls):
1645 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1648 def suitable(cls, url):
1649 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse the search "URL" (<key><prefix>:<terms>) and fan out to
        # _get_n_results with the requested result count.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: just the first result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            # Numeric prefix: clamp to the extractor's maximum.
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
1671 def _get_n_results(self, query, n):
1672 """Get a specified number of results for a query"""
1673 raise NotImplementedError('This method must be implemented by subclasses')
1676 def SEARCH_KEY(self):
1677 return self._SEARCH_KEY