_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse,
  25     compat_urlparse,
  26 )
  27 from ..utils import (
  28     NO_DEFAULT,
  29     age_restricted,
  30     bug_reports_message,
  31     clean_html,
  32     compiled_regex_type,
  33     determine_ext,
  34     error_to_compat_str,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     parse_iso8601,
  40     RegexNotFoundError,
  41     sanitize_filename,
  42     sanitized_Request,
  43     unescapeHTML,
  44     unified_strdate,
  45     url_basename,
  46     xpath_text,
  47     xpath_with_ns,
  48     determine_protocol,
  49     parse_duration,
  50     mimetype2ext,
  51     update_url_query,
  52 )
  53
  54
  55 class InfoExtractor(object):
  56     """Information Extractor class.
  57
  58     Information extractors are the classes that, given a URL, extract
  59     information about the video (or videos) the URL refers to. This
  60     information includes the real video URL, the video title, author and
  61     others. The information is stored in a dictionary which is then
  62     passed to the YoutubeDL. The YoutubeDL processes this
  63     information possibly downloading the video to the file system, among
  64     other possible outcomes.
  65
  66     The type field determines the type of the result.
  67     By far the most common value (and the default if _type is missing) is
  68     "video", which indicates a single video.
  69
  70     For a video, the dictionaries must include the following fields:
  71
  72     id:             Video identifier.
  73     title:          Video title, unescaped.
  74
  75     Additionally, it must contain either a formats entry or a url one:
  76
  77     formats:        A list of dictionaries for each format available, ordered
  78                     from worst to best quality.
  79
  80                     Potential fields:
  81                     * url        Mandatory. The URL of the video file
  82                     * ext        Will be calculated from URL if missing
  83                     * format     A human-readable description of the format
  84                                  ("mp4 container with h264/opus").
  85                                  Calculated from the format_id, width, height,
  86                                  and format_note fields if missing.
  87                     * format_id  A short description of the format
  88                                  ("mp4_h264_opus" or "19").
  89                                 Technically optional, but strongly recommended.
  90                     * format_note Additional info about the format
  91                                  ("3D" or "DASH video")
  92                     * width      Width of the video, if known
  93                     * height     Height of the video, if known
  94                     * resolution Textual description of width and height
  95                     * tbr        Average bitrate of audio and video in KBit/s
  96                     * abr        Average audio bitrate in KBit/s
  97                     * acodec     Name of the audio codec in use
  98                     * asr        Audio sampling rate in Hertz
  99                     * vbr        Average video bitrate in KBit/s
 100                     * fps        Frame rate
 101                     * vcodec     Name of the video codec in use
 102                     * container  Name of the container format
 103                     * filesize   The number of bytes, if known in advance
 104                     * filesize_approx  An estimate for the number of bytes
 105                     * player_url SWF Player URL (used for rtmpdump).
 106                     * protocol   The protocol that will be used for the actual
 107                                  download, lower-case.
 108                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 109                                  "m3u8", "m3u8_native" or "http_dash_segments".
 110                     * preference Order number of this format. If this field is
 111                                  present and not None, the formats get sorted
 112                                  by this field, regardless of all other values.
 113                                  -1 for default (order by other properties),
 114                                  -2 or smaller for less than default.
 115                                  < -1000 to hide the format (if there is
 116                                     another one which is strictly better)
 117                     * language   Language code, e.g. "de" or "en-US".
 118                     * language_preference  Is this in the language mentioned in
 119                                  the URL?
 120                                  10 if it's what the URL is about,
 121                                  -1 for default (don't know),
 122                                  -10 otherwise, other values reserved for now.
 123                     * quality    Order number of the video quality of this
 124                                  format, irrespective of the file format.
 125                                  -1 for default (order by other properties),
 126                                  -2 or smaller for less than default.
 127                     * source_preference  Order number for this video source
 128                                   (quality takes higher priority)
 129                                  -1 for default (order by other properties),
 130                                  -2 or smaller for less than default.
 131                     * http_headers  A dictionary of additional HTTP headers
 132                                  to add to the request.
 133                     * stretched_ratio  If given and not 1, indicates that the
 134                                  video's pixels are not square.
 135                                  width : height ratio as float.
 136                     * no_resume  The server does not support resuming the
 137                                  (HTTP or RTMP) download. Boolean.
 138
 139     url:            Final video URL.
 140     ext:            Video filename extension.
 141     format:         The video format, defaults to ext (used for --get-format)
 142     player_url:     SWF Player URL (used for rtmpdump).
 143
 144     The following fields are optional:
 145
 146     alt_title:      A secondary title of the video.
 147     display_id      An alternative identifier for the video, not necessarily
 148                     unique, but available before title. Typically, id is
 149                     something like "4234987", title "Dancing naked mole rats",
 150                     and display_id "dancing-naked-mole-rats"
 151     thumbnails:     A list of dictionaries, with the following entries:
 152                         * "id" (optional, string) - Thumbnail format ID
 153                         * "url"
 154                         * "preference" (optional, int) - quality of the image
 155                         * "width" (optional, int)
 156                         * "height" (optional, int)
 157                         * "resolution" (optional, string "{width}x{height}",
 158                                         deprecated)
 159     thumbnail:      Full URL to a video thumbnail image.
 160     description:    Full video description.
 161     uploader:       Full name of the video uploader.
 162     license:        License name the video is licensed under.
 163     creator:        The main artist who created the video.
 164     release_date:   The date (YYYYMMDD) when the video was released.
 165     timestamp:      UNIX timestamp of the moment the video became available.
 166     upload_date:    Video upload date (YYYYMMDD).
 167                     If not explicitly set, calculated from timestamp.
 168     uploader_id:    Nickname or id of the video uploader.
 169     uploader_url:   Full URL to a personal webpage of the video uploader.
 170     location:       Physical location where the video was filmed.
 171     subtitles:      The available subtitles as a dictionary in the format
 172                     {language: subformats}. "subformats" is a list sorted from
 173                     lower to higher preference, each element is a dictionary
 174                     with the "ext" entry and one of:
 175                         * "data": The subtitles file contents
 176                         * "url": A URL pointing to the subtitles file
 177                     "ext" will be calculated from URL if missing
 178     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 179                     automatically generated captions
 180     duration:       Length of the video in seconds, as an integer or float.
 181     view_count:     How many users have watched the video on the platform.
 182     like_count:     Number of positive ratings of the video
 183     dislike_count:  Number of negative ratings of the video
 184     repost_count:   Number of reposts of the video
 185     average_rating: Average rating given by users, the scale used depends on the webpage
 186     comment_count:  Number of comments on the video
 187     comments:       A list of comments, each with one or more of the following
 188                     properties (all but one of text or html optional):
 189                         * "author" - human-readable name of the comment author
 190                         * "author_id" - user ID of the comment author
 191                         * "id" - Comment ID
 192                         * "html" - Comment as HTML
 193                         * "text" - Plain text of the comment
 194                         * "timestamp" - UNIX timestamp of comment
 195                         * "parent" - ID of the comment this one is replying to.
 196                                      Set to "root" to indicate that this is a
 197                                      comment to the original video.
 198     age_limit:      Age restriction for the video, as an integer (years)
 199     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 200                     should allow to get the same result again. (It will be set
 201                     by YoutubeDL if it's missing)
 202     categories:     A list of categories that the video falls in, for example
 203                     ["Sports", "Berlin"]
 204     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 205     is_live:        True, False, or None (=unknown). Whether this video is a
 206                     live stream that goes on instead of a fixed-length video.
 207     start_time:     Time in seconds where the reproduction should start, as
 208                     specified in the URL.
 209     end_time:       Time in seconds where the reproduction should end, as
 210                     specified in the URL.
 211
 212     The following fields should only be used when the video belongs to some logical
 213     chapter or section:
 214
 215     chapter:        Name or title of the chapter the video belongs to.
 216     chapter_number: Number of the chapter the video belongs to, as an integer.
 217     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 218
 219     The following fields should only be used when the video is an episode of some
 220     series or programme:
 221
 222     series:         Title of the series or programme the video episode belongs to.
 223     season:         Title of the season the video episode belongs to.
 224     season_number:  Number of the season the video episode belongs to, as an integer.
 225     season_id:      Id of the season the video episode belongs to, as a unicode string.
 226     episode:        Title of the video episode. Unlike mandatory video title field,
 227                     this field should denote the exact title of the video episode
 228                     without any kind of decoration.
 229     episode_number: Number of the video episode within a season, as an integer.
 230     episode_id:     Id of the video episode, as a unicode string.
 231
 232     Unless mentioned otherwise, the fields should be Unicode strings.
 233
 234     Unless mentioned otherwise, None is equivalent to absence of information.
 235
 236
 237     _type "playlist" indicates multiple videos.
 238     There must be a key "entries", which is a list, an iterable, or a PagedList
 239     object, each element of which is a valid dictionary by this specification.
 240
 241     Additionally, playlists can have "title", "description" and "id" attributes
 242     with the same semantics as videos (see above).
 243
 244
 245     _type "multi_video" indicates that there are multiple videos that
 246     form a single show, for example multiple acts of an opera or TV episode.
 247     It must have an entries key like a playlist and contain all the keys
 248     required for a video at the same time.
 249
 250
 251     _type "url" indicates that the video must be extracted from another
 252     location, possibly by a different extractor. Its only required key is:
 253     "url" - the next URL to extract.
 254     The key "ie_key" can be set to the class name (minus the trailing "IE",
 255     e.g. "Youtube") if the extractor class is known in advance.
 256     Additionally, the dictionary may have any properties of the resolved entity
 257     known in advance, for example "title" if the title of the referred video is
 258     known ahead of time.
 259
 260
 261     _type "url_transparent" entities have the same specification as "url", but
 262     indicate that the given additional information is more precise than the one
 263     associated with the resolved URL.
 264     This is useful when a site employs a video service that hosts the video and
 265     its technical metadata, but that video service does not embed a useful
 266     title, description etc.
 267
 268
 269     Subclasses of this one should re-define the _real_initialize() and
 270     _real_extract() methods and define a _VALID_URL regexp.
 271     Probably, they should also be added to the list of extractors.
 272
 273     Finally, the _WORKING attribute should be set to False for broken IEs
 274     in order to warn the users and skip the tests.
 275     """
 276
    # True once _real_initialize() has run for this instance (see initialize())
    _ready = False
    # Downloader object assigned through set_downloader(); None until one is set
    _downloader = None
    # Value returned by working(); set to False to mark a broken extractor
    _WORKING = True
 280
 281     def __init__(self, downloader=None):
 282         """Constructor. Receives an optional downloader."""
 283         self._ready = False
 284         self.set_downloader(downloader)
 285
 286     @classmethod
 287     def suitable(cls, url):
 288         """Receives a URL and returns True if suitable for this IE."""
 289
 290         # This does not use has/getattr intentionally - we want to know whether
 291         # we have cached the regexp for *this* class, whereas getattr would also
 292         # match the superclass
 293         if '_VALID_URL_RE' not in cls.__dict__:
 294             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 295         return cls._VALID_URL_RE.match(url) is not None
 296
 297     @classmethod
 298     def _match_id(cls, url):
 299         if '_VALID_URL_RE' not in cls.__dict__:
 300             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 301         m = cls._VALID_URL_RE.match(url)
 302         assert m
 303         return m.group('id')
 304
 305     @classmethod
 306     def working(cls):
 307         """Getter method for _WORKING."""
 308         return cls._WORKING
 309
 310     def initialize(self):
 311         """Initializes an instance (authentication, etc)."""
 312         if not self._ready:
 313             self._real_initialize()
 314             self._ready = True
 315
 316     def extract(self, url):
 317         """Extracts URL information and returns it in list of dicts."""
 318         try:
 319             self.initialize()
 320             return self._real_extract(url)
 321         except ExtractorError:
 322             raise
 323         except compat_http_client.IncompleteRead as e:
 324             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 325         except (KeyError, StopIteration) as e:
 326             raise ExtractorError('An extractor error has occurred.', cause=e)
 327
 328     def set_downloader(self, downloader):
 329         """Sets the downloader for this IE."""
 330         self._downloader = downloader
 331
 332     def _real_initialize(self):
 333         """Real initialization process. Redefine in subclasses."""
 334         pass
 335
 336     def _real_extract(self, url):
 337         """Real extraction process. Redefine in subclasses."""
 338         pass
 339
 340     @classmethod
 341     def ie_key(cls):
 342         """A string for getting the InfoExtractor with get_info_extractor"""
 343         return compat_str(cls.__name__[:-2])
 344
 345     @property
 346     def IE_NAME(self):
 347         return compat_str(type(self).__name__[:-2])
 348
 349     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
 350         """ Returns the response handle """
 351         if note is None:
 352             self.report_download_webpage(video_id)
 353         elif note is not False:
 354             if video_id is None:
 355                 self.to_screen('%s' % (note,))
 356             else:
 357                 self.to_screen('%s: %s' % (video_id, note))
 358         # data, headers and query params will be ignored for `Request` objects
 359         if isinstance(url_or_request, compat_str):
 360             if query:
 361                 url_or_request = update_url_query(url_or_request, query)
 362             if data or headers:
 363                 url_or_request = sanitized_Request(url_or_request, data, headers or {})
 364         try:
 365             return self._downloader.urlopen(url_or_request)
 366         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 367             if errnote is False:
 368                 return False
 369             if errnote is None:
 370                 errnote = 'Unable to download webpage'
 371
 372             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 373             if fatal:
 374                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 375             else:
 376                 self._downloader.report_warning(errmsg)
 377                 return False
 378
 379     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
 380         """ Returns a tuple (page content as string, URL handle) """
 381         # Strip hashes from the URL (#1038)
 382         if isinstance(url_or_request, (compat_str, str)):
 383             url_or_request = url_or_request.partition('#')[0]
 384
 385         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 386         if urlh is False:
 387             assert not fatal
 388             return False
 389         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 390         return (content, urlh)
 391
 392     @staticmethod
 393     def _guess_encoding_from_content(content_type, webpage_bytes):
 394         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 395         if m:
 396             encoding = m.group(1)
 397         else:
 398             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 399                           webpage_bytes[:1024])
 400             if m:
 401                 encoding = m.group(1).decode('ascii')
 402             elif webpage_bytes.startswith(b'\xff\xfe'):
 403                 encoding = 'utf-16'
 404             else:
 405                 encoding = 'utf-8'
 406
 407         return encoding
 408
 409     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 410         content_type = urlh.headers.get('Content-Type', '')
 411         webpage_bytes = urlh.read()
 412         if prefix is not None:
 413             webpage_bytes = prefix + webpage_bytes
 414         if not encoding:
 415             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 416         if self._downloader.params.get('dump_intermediate_pages', False):
 417             try:
 418                 url = url_or_request.get_full_url()
 419             except AttributeError:
 420                 url = url_or_request
 421             self.to_screen('Dumping request to ' + url)
 422             dump = base64.b64encode(webpage_bytes).decode('ascii')
 423             self._downloader.to_screen(dump)
 424         if self._downloader.params.get('write_pages', False):
 425             try:
 426                 url = url_or_request.get_full_url()
 427             except AttributeError:
 428                 url = url_or_request
 429             basen = '%s_%s' % (video_id, url)
 430             if len(basen) > 240:
 431                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 432                 basen = basen[:240 - len(h)] + h
 433             raw_filename = basen + '.dump'
 434             filename = sanitize_filename(raw_filename, restricted=True)
 435             self.to_screen('Saving request to ' + filename)
 436             # Working around MAX_PATH limitation on Windows (see
 437             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 438             if compat_os_name == 'nt':
 439                 absfilepath = os.path.abspath(filename)
 440                 if len(absfilepath) > 259:
 441                     filename = '\\\\?\\' + absfilepath
 442             with open(filename, 'wb') as outf:
 443                 outf.write(webpage_bytes)
 444
 445         try:
 446             content = webpage_bytes.decode(encoding, 'replace')
 447         except LookupError:
 448             content = webpage_bytes.decode('utf-8', 'replace')
 449
 450         if ('<title>Access to this site is blocked</title>' in content and
 451                 'Websense' in content[:512]):
 452             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 453             blocked_iframe = self._html_search_regex(
 454                 r'<iframe src="([^"]+)"', content,
 455                 'Websense information URL', default=None)
 456             if blocked_iframe:
 457                 msg += ' Visit %s for more details' % blocked_iframe
 458             raise ExtractorError(msg, expected=True)
 459         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 460             msg = (
 461                 'Access to this webpage has been blocked by Indian censorship. '
 462                 'Use a VPN or proxy server (with --proxy) to route around it.')
 463             block_msg = self._html_search_regex(
 464                 r'</h1><p>(.*?)</p>',
 465                 content, 'block message', default=None)
 466             if block_msg:
 467                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 468             raise ExtractorError(msg, expected=True)
 469
 470         return content
 471
 472     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
 473         """ Returns the data of the page as a string """
 474         success = False
 475         try_count = 0
 476         while success is False:
 477             try:
 478                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 479                 success = True
 480             except compat_http_client.IncompleteRead as e:
 481                 try_count += 1
 482                 if try_count >= tries:
 483                     raise e
 484                 self._sleep(timeout, video_id)
 485         if res is False:
 486             return res
 487         else:
 488             content, _ = res
 489             return content
 490
 491     def _download_xml(self, url_or_request, video_id,
 492                       note='Downloading XML', errnote='Unable to download XML',
 493                       transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
 494         """Return the xml as an xml.etree.ElementTree.Element"""
 495         xml_string = self._download_webpage(
 496             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 497         if xml_string is False:
 498             return xml_string
 499         if transform_source:
 500             xml_string = transform_source(xml_string)
 501         return compat_etree_fromstring(xml_string.encode('utf-8'))
 502
 503     def _download_json(self, url_or_request, video_id,
 504                        note='Downloading JSON metadata',
 505                        errnote='Unable to download JSON metadata',
 506                        transform_source=None,
 507                        fatal=True, encoding=None, data=None, headers=None, query=None):
 508         json_string = self._download_webpage(
 509             url_or_request, video_id, note, errnote, fatal=fatal,
 510             encoding=encoding, data=data, headers=headers, query=query)
 511         if (not fatal) and json_string is False:
 512             return None
 513         return self._parse_json(
 514             json_string, video_id, transform_source=transform_source, fatal=fatal)
 515
 516     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 517         if transform_source:
 518             json_string = transform_source(json_string)
 519         try:
 520             return json.loads(json_string)
 521         except ValueError as ve:
 522             errmsg = '%s: Failed to parse JSON ' % video_id
 523             if fatal:
 524                 raise ExtractorError(errmsg, cause=ve)
 525             else:
 526                 self.report_warning(errmsg + str(ve))
 527
 528     def report_warning(self, msg, video_id=None):
 529         idstr = '' if video_id is None else '%s: ' % video_id
 530         self._downloader.report_warning(
 531             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 532
 533     def to_screen(self, msg):
 534         """Print msg to screen, prefixing it with '[ie_name]'"""
 535         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 536
 537     def report_extraction(self, id_or_name):
 538         """Report information extraction."""
 539         self.to_screen('%s: Extracting information' % id_or_name)
 540
 541     def report_download_webpage(self, video_id):
 542         """Report webpage download."""
 543         self.to_screen('%s: Downloading webpage' % video_id)
 544
 545     def report_age_confirmation(self):
 546         """Report attempt to confirm age."""
 547         self.to_screen('Confirming age')
 548
 549     def report_login(self):
 550         """Report attempt to log in."""
 551         self.to_screen('Logging in')
 552
 553     @staticmethod
 554     def raise_login_required(msg='This video is only available for registered users'):
 555         raise ExtractorError(
 556             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 557             expected=True)
 558
 559     @staticmethod
 560     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 561         raise ExtractorError(
 562             '%s. You might want to use --proxy to workaround.' % msg,
 563             expected=True)
 564
 565     # Methods for following #608
 566     @staticmethod
 567     def url_result(url, ie=None, video_id=None, video_title=None):
 568         """Returns a URL that points to a page that should be processed"""
 569         # TODO: ie should be the class used for getting the info
 570         video_info = {'_type': 'url',
 571                       'url': url,
 572                       'ie_key': ie}
 573         if video_id is not None:
 574             video_info['id'] = video_id
 575         if video_title is not None:
 576             video_info['title'] = video_title
 577         return video_info
 578
 579     @staticmethod
 580     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 581         """Returns a playlist"""
 582         video_info = {'_type': 'playlist',
 583                       'entries': entries}
 584         if playlist_id:
 585             video_info['id'] = playlist_id
 586         if playlist_title:
 587             video_info['title'] = playlist_title
 588         if playlist_description:
 589             video_info['description'] = playlist_description
 590         return video_info
 591
 592     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 593         """
 594         Perform a regex search on the given string, using a single or a list of
 595         patterns returning the first matching group.
 596         In case of failure return a default value or raise a WARNING or a
 597         RegexNotFoundError, depending on fatal, specifying the field name.
 598         """
 599         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 600             mobj = re.search(pattern, string, flags)
 601         else:
 602             for p in pattern:
 603                 mobj = re.search(p, string, flags)
 604                 if mobj:
 605                     break
 606
 607         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 608             _name = '\033[0;34m%s\033[0m' % name
 609         else:
 610             _name = name
 611
 612         if mobj:
 613             if group is None:
 614                 # return the first matching group
 615                 return next(g for g in mobj.groups() if g is not None)
 616             else:
 617                 return mobj.group(group)
 618         elif default is not NO_DEFAULT:
 619             return default
 620         elif fatal:
 621             raise RegexNotFoundError('Unable to extract %s' % _name)
 622         else:
 623             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 624             return None
 625
 626     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 627         """
 628         Like _search_regex, but strips HTML tags and unescapes entities.
 629         """
 630         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 631         if res:
 632             return clean_html(res).strip()
 633         else:
 634             return res
 635
 636     def _get_login_info(self):
 637         """
 638         Get the login info as (username, password)
 639         It will look in the netrc file using the _NETRC_MACHINE value
 640         If there's no info available, return (None, None)
 641         """
 642         if self._downloader is None:
 643             return (None, None)
 644
 645         username = None
 646         password = None
 647         downloader_params = self._downloader.params
 648
 649         # Attempt to use provided username and password or .netrc data
 650         if downloader_params.get('username') is not None:
 651             username = downloader_params['username']
 652             password = downloader_params['password']
 653         elif downloader_params.get('usenetrc', False):
 654             try:
 655                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 656                 if info is not None:
 657                     username = info[0]
 658                     password = info[2]
 659                 else:
 660                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 661             except (IOError, netrc.NetrcParseError) as err:
 662                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 663
 664         return (username, password)
 665
 666     def _get_tfa_info(self, note='two-factor verification code'):
 667         """
 668         Get the two-factor authentication info
 669         TODO - asking the user will be required for sms/phone verify
 670         currently just uses the command line option
 671         If there's no info available, return None
 672         """
 673         if self._downloader is None:
 674             return None
 675         downloader_params = self._downloader.params
 676
 677         if downloader_params.get('twofactor') is not None:
 678             return downloader_params['twofactor']
 679
 680         return compat_getpass('Type %s and press [Return]: ' % note)
 681
 682     # Helper functions for extracting OpenGraph info
 683     @staticmethod
 684     def _og_regexes(prop):
 685         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 686         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 687                        % {'prop': re.escape(prop)})
 688         template = r'<meta[^>]+?%s[^>]+?%s'
 689         return [
 690             template % (property_re, content_re),
 691             template % (content_re, property_re),
 692         ]
 693
 694     @staticmethod
 695     def _meta_regex(prop):
 696         return r'''(?isx)<meta
 697                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 698                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 699
 700     def _og_search_property(self, prop, html, name=None, **kargs):
 701         if name is None:
 702             name = 'OpenGraph %s' % prop
 703         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 704         if escaped is None:
 705             return None
 706         return unescapeHTML(escaped)
 707
 708     def _og_search_thumbnail(self, html, **kargs):
 709         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 710
 711     def _og_search_description(self, html, **kargs):
 712         return self._og_search_property('description', html, fatal=False, **kargs)
 713
 714     def _og_search_title(self, html, **kargs):
 715         return self._og_search_property('title', html, **kargs)
 716
 717     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 718         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 719         if secure:
 720             regexes = self._og_regexes('video:secure_url') + regexes
 721         return self._html_search_regex(regexes, html, name, **kargs)
 722
 723     def _og_search_url(self, html, **kargs):
 724         return self._og_search_property('url', html, **kargs)
 725
 726     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 727         if display_name is None:
 728             display_name = name
 729         return self._html_search_regex(
 730             self._meta_regex(name),
 731             html, display_name, fatal=fatal, group='content', **kwargs)
 732
 733     def _dc_search_uploader(self, html):
 734         return self._html_search_meta('dc.creator', html, 'uploader')
 735
 736     def _rta_search(self, html):
 737         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 738         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 739                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 740                      html):
 741             return 18
 742         return 0
 743
 744     def _media_rating_search(self, html):
 745         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 746         rating = self._html_search_meta('rating', html)
 747
 748         if not rating:
 749             return None
 750
 751         RATING_TABLE = {
 752             'safe for kids': 0,
 753             'general': 8,
 754             '14 years': 14,
 755             'mature': 17,
 756             'restricted': 19,
 757         }
 758         return RATING_TABLE.get(rating.lower())
 759
 760     def _family_friendly_search(self, html):
 761         # See http://schema.org/VideoObject
 762         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 763
 764         if not family_friendly:
 765             return None
 766
 767         RATING_TABLE = {
 768             '1': 0,
 769             'true': 0,
 770             '0': 18,
 771             'false': 18,
 772         }
 773         return RATING_TABLE.get(family_friendly.lower())
 774
 775     def _twitter_search_player(self, html):
 776         return self._html_search_meta('twitter:player', html,
 777                                       'twitter card player')
 778
 779     def _search_json_ld(self, html, video_id, **kwargs):
 780         json_ld = self._search_regex(
 781             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 782             html, 'JSON-LD', group='json_ld', **kwargs)
 783         if not json_ld:
 784             return {}
 785         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 786
 787     def _json_ld(self, json_ld, video_id, fatal=True):
 788         if isinstance(json_ld, compat_str):
 789             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 790         if not json_ld:
 791             return {}
 792         info = {}
 793         if json_ld.get('@context') == 'http://schema.org':
 794             item_type = json_ld.get('@type')
 795             if item_type == 'TVEpisode':
 796                 info.update({
 797                     'episode': unescapeHTML(json_ld.get('name')),
 798                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 799                     'description': unescapeHTML(json_ld.get('description')),
 800                 })
 801                 part_of_season = json_ld.get('partOfSeason')
 802                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 803                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 804                 part_of_series = json_ld.get('partOfSeries')
 805                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 806                     info['series'] = unescapeHTML(part_of_series.get('name'))
 807             elif item_type == 'Article':
 808                 info.update({
 809                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 810                     'title': unescapeHTML(json_ld.get('headline')),
 811                     'description': unescapeHTML(json_ld.get('articleBody')),
 812                 })
 813         return dict((k, v) for k, v in info.items() if v is not None)
 814
 815     @staticmethod
 816     def _hidden_inputs(html):
 817         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 818         hidden_inputs = {}
 819         for input in re.findall(r'(?i)<input([^>]+)>', html):
 820             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 821                 continue
 822             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 823             if not name:
 824                 continue
 825             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 826             if not value:
 827                 continue
 828             hidden_inputs[name.group('value')] = value.group('value')
 829         return hidden_inputs
 830
 831     def _form_hidden_inputs(self, form_id, html):
 832         form = self._search_regex(
 833             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 834             html, '%s form' % form_id, group='form')
 835         return self._hidden_inputs(form)
 836
 837     def _sort_formats(self, formats, field_preference=None):
 838         if not formats:
 839             raise ExtractorError('No video formats found')
 840
 841         for f in formats:
 842             # Automatically determine tbr when missing based on abr and vbr (improves
 843             # formats sorting in some cases)
 844             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 845                 f['tbr'] = f['abr'] + f['vbr']
 846
 847         def _formats_key(f):
 848             # TODO remove the following workaround
 849             from ..utils import determine_ext
 850             if not f.get('ext') and 'url' in f:
 851                 f['ext'] = determine_ext(f['url'])
 852
 853             if isinstance(field_preference, (list, tuple)):
 854                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 855
 856             preference = f.get('preference')
 857             if preference is None:
 858                 preference = 0
 859                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 860                     preference -= 0.5
 861
 862             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 863
 864             if f.get('vcodec') == 'none':  # audio only
 865                 if self._downloader.params.get('prefer_free_formats'):
 866                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 867                 else:
 868                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 869                 ext_preference = 0
 870                 try:
 871                     audio_ext_preference = ORDER.index(f['ext'])
 872                 except ValueError:
 873                     audio_ext_preference = -1
 874             else:
 875                 if self._downloader.params.get('prefer_free_formats'):
 876                     ORDER = ['flv', 'mp4', 'webm']
 877                 else:
 878                     ORDER = ['webm', 'flv', 'mp4']
 879                 try:
 880                     ext_preference = ORDER.index(f['ext'])
 881                 except ValueError:
 882                     ext_preference = -1
 883                 audio_ext_preference = 0
 884
 885             return (
 886                 preference,
 887                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 888                 f.get('quality') if f.get('quality') is not None else -1,
 889                 f.get('tbr') if f.get('tbr') is not None else -1,
 890                 f.get('filesize') if f.get('filesize') is not None else -1,
 891                 f.get('vbr') if f.get('vbr') is not None else -1,
 892                 f.get('height') if f.get('height') is not None else -1,
 893                 f.get('width') if f.get('width') is not None else -1,
 894                 proto_preference,
 895                 ext_preference,
 896                 f.get('abr') if f.get('abr') is not None else -1,
 897                 audio_ext_preference,
 898                 f.get('fps') if f.get('fps') is not None else -1,
 899                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 900                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 901                 f.get('format_id') if f.get('format_id') is not None else '',
 902             )
 903         formats.sort(key=_formats_key)
 904
 905     def _check_formats(self, formats, video_id):
 906         if formats:
 907             formats[:] = filter(
 908                 lambda f: self._is_valid_url(
 909                     f['url'], video_id,
 910                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 911                 formats)
 912
 913     @staticmethod
 914     def _remove_duplicate_formats(formats):
 915         format_urls = set()
 916         unique_formats = []
 917         for f in formats:
 918             if f['url'] not in format_urls:
 919                 format_urls.add(f['url'])
 920                 unique_formats.append(f)
 921         formats[:] = unique_formats
 922
 923     def _is_valid_url(self, url, video_id, item='video'):
 924         url = self._proto_relative_url(url, scheme='http:')
 925         # For now assume non HTTP(S) URLs always valid
 926         if not (url.startswith('http://') or url.startswith('https://')):
 927             return True
 928         try:
 929             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 930             return True
 931         except ExtractorError as e:
 932             if isinstance(e.cause, compat_urllib_error.URLError):
 933                 self.to_screen(
 934                     '%s: %s URL is invalid, skipping' % (video_id, item))
 935                 return False
 936             raise
 937
 938     def http_scheme(self):
 939         """ Either "http:" or "https:", depending on the user's preferences """
 940         return (
 941             'http:'
 942             if self._downloader.params.get('prefer_insecure', False)
 943             else 'https:')
 944
 945     def _proto_relative_url(self, url, scheme=None):
 946         if url is None:
 947             return url
 948         if url.startswith('//'):
 949             if scheme is None:
 950                 scheme = self.http_scheme()
 951             return scheme + url
 952         else:
 953             return url
 954
 955     def _sleep(self, timeout, video_id, msg_template=None):
 956         if msg_template is None:
 957             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 958         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 959         self.to_screen(msg)
 960         time.sleep(timeout)
 961
 962     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 963                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 964                              fatal=True):
 965         manifest = self._download_xml(
 966             manifest_url, video_id, 'Downloading f4m manifest',
 967             'Unable to download f4m manifest',
 968             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 969             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 970             transform_source=transform_source,
 971             fatal=fatal)
 972
 973         if manifest is False:
 974             return []
 975
 976         return self._parse_f4m_formats(
 977             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
 978             transform_source=transform_source, fatal=fatal)
 979
 980     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
 981                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
 982                            fatal=True):
 983         formats = []
 984         manifest_version = '1.0'
 985         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 986         if not media_nodes:
 987             manifest_version = '2.0'
 988             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 989         base_url = xpath_text(
 990             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 991             'base URL', default=None)
 992         if base_url:
 993             base_url = base_url.strip()
 994         for i, media_el in enumerate(media_nodes):
 995             if manifest_version == '2.0':
 996                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 997                 if not media_url:
 998                     continue
 999                 manifest_url = (
1000                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1001                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1002                 # If media_url is itself a f4m manifest do the recursive extraction
1003                 # since bitrates in parent manifest (this one) and media_url manifest
1004                 # may differ leading to inability to resolve the format by requested
1005                 # bitrate in f4m downloader
1006                 if determine_ext(manifest_url) == 'f4m':
1007                     formats.extend(self._extract_f4m_formats(
1008                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1009                         transform_source=transform_source, fatal=fatal))
1010                     continue
1011             tbr = int_or_none(media_el.attrib.get('bitrate'))
1012             formats.append({
1013                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1014                 'url': manifest_url,
1015                 'ext': 'flv',
1016                 'tbr': tbr,
1017                 'width': int_or_none(media_el.attrib.get('width')),
1018                 'height': int_or_none(media_el.attrib.get('height')),
1019                 'preference': preference,
1020             })
1021         self._sort_formats(formats)
1022
1023         return formats
1024
1025     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1026                               entry_protocol='m3u8', preference=None,
1027                               m3u8_id=None, note=None, errnote=None,
1028                               fatal=True):
1029
1030         formats = [{
1031             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1032             'url': m3u8_url,
1033             'ext': ext,
1034             'protocol': 'm3u8',
1035             'preference': preference - 1 if preference else -1,
1036             'resolution': 'multiple',
1037             'format_note': 'Quality selection URL',
1038         }]
1039
1040         format_url = lambda u: (
1041             u
1042             if re.match(r'^https?://', u)
1043             else compat_urlparse.urljoin(m3u8_url, u))
1044
1045         res = self._download_webpage_handle(
1046             m3u8_url, video_id,
1047             note=note or 'Downloading m3u8 information',
1048             errnote=errnote or 'Failed to download m3u8 information',
1049             fatal=fatal)
1050         if res is False:
1051             return []
1052         m3u8_doc, urlh = res
1053         m3u8_url = urlh.geturl()
1054
1055         # We should try extracting formats only from master playlists [1], i.e.
1056         # playlists that describe available qualities. On the other hand media
1057         # playlists [2] should be returned as is since they contain just the media
1058         # without qualities renditions.
1059         # Fortunately, master playlist can be easily distinguished from media
1060         # playlist based on particular tags availability. As of [1, 2] master
1061         # playlist tags MUST NOT appear in a media playist and vice versa.
1062         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1063         # and MUST NOT appear in master playlist thus we can clearly detect media
1064         # playlist with this criterion.
1065         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1066         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1067         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1068         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1069             return [{
1070                 'url': m3u8_url,
1071                 'format_id': m3u8_id,
1072                 'ext': ext,
1073                 'protocol': entry_protocol,
1074                 'preference': preference,
1075             }]
1076         last_info = None
1077         last_media = None
1078         kv_rex = re.compile(
1079             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1080         for line in m3u8_doc.splitlines():
1081             if line.startswith('#EXT-X-STREAM-INF:'):
1082                 last_info = {}
1083                 for m in kv_rex.finditer(line):
1084                     v = m.group('val')
1085                     if v.startswith('"'):
1086                         v = v[1:-1]
1087                     last_info[m.group('key')] = v
1088             elif line.startswith('#EXT-X-MEDIA:'):
1089                 last_media = {}
1090                 for m in kv_rex.finditer(line):
1091                     v = m.group('val')
1092                     if v.startswith('"'):
1093                         v = v[1:-1]
1094                     last_media[m.group('key')] = v
1095             elif line.startswith('#') or not line.strip():
1096                 continue
1097             else:
1098                 if last_info is None:
1099                     formats.append({'url': format_url(line)})
1100                     continue
1101                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1102                 format_id = []
1103                 if m3u8_id:
1104                     format_id.append(m3u8_id)
1105                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1106                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1107                 f = {
1108                     'format_id': '-'.join(format_id),
1109                     'url': format_url(line.strip()),
1110                     'tbr': tbr,
1111                     'ext': ext,
1112                     'protocol': entry_protocol,
1113                     'preference': preference,
1114                 }
1115                 resolution = last_info.get('RESOLUTION')
1116                 if resolution:
1117                     width_str, height_str = resolution.split('x')
1118                     f['width'] = int(width_str)
1119                     f['height'] = int(height_str)
1120                 codecs = last_info.get('CODECS')
1121                 if codecs:
1122                     vcodec, acodec = [None] * 2
1123                     va_codecs = codecs.split(',')
1124                     if len(va_codecs) == 1:
1125                         # Audio only entries usually come with single codec and
1126                         # no resolution. For more robustness we also check it to
1127                         # be mp4 audio.
1128                         if not resolution and va_codecs[0].startswith('mp4a'):
1129                             vcodec, acodec = 'none', va_codecs[0]
1130                         else:
1131                             vcodec = va_codecs[0]
1132                     else:
1133                         vcodec, acodec = va_codecs[:2]
1134                     f.update({
1135                         'acodec': acodec,
1136                         'vcodec': vcodec,
1137                     })
1138                 if last_media is not None:
1139                     f['m3u8_media'] = last_media
1140                     last_media = None
1141                 formats.append(f)
1142                 last_info = {}
1143         self._sort_formats(formats)
1144         return formats
1145
1146     @staticmethod
1147     def _xpath_ns(path, namespace=None):
1148         if not namespace:
1149             return path
1150         out = []
1151         for c in path.split('/'):
1152             if not c or c == '.':
1153                 out.append(c)
1154             else:
1155                 out.append('{%s}%s' % (namespace, c))
1156         return '/'.join(out)
1157
1158     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1159         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1160
1161         if smil is False:
1162             assert not fatal
1163             return []
1164
1165         namespace = self._parse_smil_namespace(smil)
1166
1167         return self._parse_smil_formats(
1168             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1169
1170     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1171         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1172         if smil is False:
1173             return {}
1174         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1175
1176     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1177         return self._download_xml(
1178             smil_url, video_id, 'Downloading SMIL file',
1179             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1180
1181     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1182         namespace = self._parse_smil_namespace(smil)
1183
1184         formats = self._parse_smil_formats(
1185             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1186         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1187
1188         video_id = os.path.splitext(url_basename(smil_url))[0]
1189         title = None
1190         description = None
1191         upload_date = None
1192         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1193             name = meta.attrib.get('name')
1194             content = meta.attrib.get('content')
1195             if not name or not content:
1196                 continue
1197             if not title and name == 'title':
1198                 title = content
1199             elif not description and name in ('description', 'abstract'):
1200                 description = content
1201             elif not upload_date and name == 'date':
1202                 upload_date = unified_strdate(content)
1203
1204         thumbnails = [{
1205             'id': image.get('type'),
1206             'url': image.get('src'),
1207             'width': int_or_none(image.get('width')),
1208             'height': int_or_none(image.get('height')),
1209         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1210
1211         return {
1212             'id': video_id,
1213             'title': title or video_id,
1214             'description': description,
1215             'upload_date': upload_date,
1216             'thumbnails': thumbnails,
1217             'formats': formats,
1218             'subtitles': subtitles,
1219         }
1220
1221     def _parse_smil_namespace(self, smil):
1222         return self._search_regex(
1223             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1224
1225     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1226         base = smil_url
1227         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1228             b = meta.get('base') or meta.get('httpBase')
1229             if b:
1230                 base = b
1231                 break
1232
1233         formats = []
1234         rtmp_count = 0
1235         http_count = 0
1236         m3u8_count = 0
1237
1238         srcs = []
1239         videos = smil.findall(self._xpath_ns('.//video', namespace))
1240         for video in videos:
1241             src = video.get('src')
1242             if not src or src in srcs:
1243                 continue
1244             srcs.append(src)
1245
1246             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1247             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1248             width = int_or_none(video.get('width'))
1249             height = int_or_none(video.get('height'))
1250             proto = video.get('proto')
1251             ext = video.get('ext')
1252             src_ext = determine_ext(src)
1253             streamer = video.get('streamer') or base
1254
1255             if proto == 'rtmp' or streamer.startswith('rtmp'):
1256                 rtmp_count += 1
1257                 formats.append({
1258                     'url': streamer,
1259                     'play_path': src,
1260                     'ext': 'flv',
1261                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1262                     'tbr': bitrate,
1263                     'filesize': filesize,
1264                     'width': width,
1265                     'height': height,
1266                 })
1267                 if transform_rtmp_url:
1268                     streamer, src = transform_rtmp_url(streamer, src)
1269                     formats[-1].update({
1270                         'url': streamer,
1271                         'play_path': src,
1272                     })
1273                 continue
1274
1275             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1276             src_url = src_url.strip()
1277
1278             if proto == 'm3u8' or src_ext == 'm3u8':
1279                 m3u8_formats = self._extract_m3u8_formats(
1280                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1281                 if len(m3u8_formats) == 1:
1282                     m3u8_count += 1
1283                     m3u8_formats[0].update({
1284                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1285                         'tbr': bitrate,
1286                         'width': width,
1287                         'height': height,
1288                     })
1289                 formats.extend(m3u8_formats)
1290                 continue
1291
1292             if src_ext == 'f4m':
1293                 f4m_url = src_url
1294                 if not f4m_params:
1295                     f4m_params = {
1296                         'hdcore': '3.2.0',
1297                         'plugin': 'flowplayer-3.2.0.1',
1298                     }
1299                 f4m_url += '&' if '?' in f4m_url else '?'
1300                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1301                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1302                 continue
1303
1304             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1305                 http_count += 1
1306                 formats.append({
1307                     'url': src_url,
1308                     'ext': ext or src_ext or 'flv',
1309                     'format_id': 'http-%d' % (bitrate or http_count),
1310                     'tbr': bitrate,
1311                     'filesize': filesize,
1312                     'width': width,
1313                     'height': height,
1314                 })
1315                 continue
1316
1317         self._sort_formats(formats)
1318
1319         return formats
1320
1321     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1322         urls = []
1323         subtitles = {}
1324         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1325             src = textstream.get('src')
1326             if not src or src in urls:
1327                 continue
1328             urls.append(src)
1329             ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1330             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1331             subtitles.setdefault(lang, []).append({
1332                 'url': src,
1333                 'ext': ext,
1334             })
1335         return subtitles
1336
1337     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1338         xspf = self._download_xml(
1339             playlist_url, playlist_id, 'Downloading xpsf playlist',
1340             'Unable to download xspf manifest', fatal=fatal)
1341         if xspf is False:
1342             return []
1343         return self._parse_xspf(xspf, playlist_id)
1344
1345     def _parse_xspf(self, playlist, playlist_id):
1346         NS_MAP = {
1347             'xspf': 'http://xspf.org/ns/0/',
1348             's1': 'http://static.streamone.nl/player/ns/0',
1349         }
1350
1351         entries = []
1352         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1353             title = xpath_text(
1354                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1355             description = xpath_text(
1356                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1357             thumbnail = xpath_text(
1358                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1359             duration = float_or_none(
1360                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1361
1362             formats = [{
1363                 'url': location.text,
1364                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1365                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1366                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1367             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1368             self._sort_formats(formats)
1369
1370             entries.append({
1371                 'id': playlist_id,
1372                 'title': title,
1373                 'description': description,
1374                 'thumbnail': thumbnail,
1375                 'duration': duration,
1376                 'formats': formats,
1377             })
1378         return entries
1379
1380     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1381         res = self._download_webpage_handle(
1382             mpd_url, video_id,
1383             note=note or 'Downloading MPD manifest',
1384             errnote=errnote or 'Failed to download MPD manifest',
1385             fatal=fatal)
1386         if res is False:
1387             return []
1388         mpd, urlh = res
1389         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1390
1391         return self._parse_mpd_formats(
1392             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1393
1394     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1395         if mpd_doc.get('type') == 'dynamic':
1396             return []
1397
1398         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1399
1400         def _add_ns(path):
1401             return self._xpath_ns(path, namespace)
1402
1403         def is_drm_protected(element):
1404             return element.find(_add_ns('ContentProtection')) is not None
1405
1406         def extract_multisegment_info(element, ms_parent_info):
1407             ms_info = ms_parent_info.copy()
1408             segment_list = element.find(_add_ns('SegmentList'))
1409             if segment_list is not None:
1410                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1411                 if segment_urls_e:
1412                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1413                 initialization = segment_list.find(_add_ns('Initialization'))
1414                 if initialization is not None:
1415                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1416             else:
1417                 segment_template = element.find(_add_ns('SegmentTemplate'))
1418                 if segment_template is not None:
1419                     start_number = segment_template.get('startNumber')
1420                     if start_number:
1421                         ms_info['start_number'] = int(start_number)
1422                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1423                     if segment_timeline is not None:
1424                         s_e = segment_timeline.findall(_add_ns('S'))
1425                         if s_e:
1426                             ms_info['total_number'] = 0
1427                             for s in s_e:
1428                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1429                     else:
1430                         timescale = segment_template.get('timescale')
1431                         if timescale:
1432                             ms_info['timescale'] = int(timescale)
1433                         segment_duration = segment_template.get('duration')
1434                         if segment_duration:
1435                             ms_info['segment_duration'] = int(segment_duration)
1436                     media_template = segment_template.get('media')
1437                     if media_template:
1438                         ms_info['media_template'] = media_template
1439                     initialization = segment_template.get('initialization')
1440                     if initialization:
1441                         ms_info['initialization_url'] = initialization
1442                     else:
1443                         initialization = segment_template.find(_add_ns('Initialization'))
1444                         if initialization is not None:
1445                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1446             return ms_info
1447
1448         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1449         formats = []
1450         for period in mpd_doc.findall(_add_ns('Period')):
1451             period_duration = parse_duration(period.get('duration')) or mpd_duration
1452             period_ms_info = extract_multisegment_info(period, {
1453                 'start_number': 1,
1454                 'timescale': 1,
1455             })
1456             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1457                 if is_drm_protected(adaptation_set):
1458                     continue
1459                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1460                 for representation in adaptation_set.findall(_add_ns('Representation')):
1461                     if is_drm_protected(representation):
1462                         continue
1463                     representation_attrib = adaptation_set.attrib.copy()
1464                     representation_attrib.update(representation.attrib)
1465                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1466                     mime_type = representation_attrib['mimeType']
1467                     content_type = mime_type.split('/')[0]
1468                     if content_type == 'text':
1469                         # TODO implement WebVTT downloading
1470                         pass
1471                     elif content_type == 'video' or content_type == 'audio':
1472                         base_url = ''
1473                         for element in (representation, adaptation_set, period, mpd_doc):
1474                             base_url_e = element.find(_add_ns('BaseURL'))
1475                             if base_url_e is not None:
1476                                 base_url = base_url_e.text + base_url
1477                                 if re.match(r'^https?://', base_url):
1478                                     break
1479                         if mpd_base_url and not re.match(r'^https?://', base_url):
1480                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1481                                 mpd_base_url += '/'
1482                             base_url = mpd_base_url + base_url
1483                         representation_id = representation_attrib.get('id')
1484                         lang = representation_attrib.get('lang')
1485                         url_el = representation.find(_add_ns('BaseURL'))
1486                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1487                         f = {
1488                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1489                             'url': base_url,
1490                             'ext': mimetype2ext(mime_type),
1491                             'width': int_or_none(representation_attrib.get('width')),
1492                             'height': int_or_none(representation_attrib.get('height')),
1493                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1494                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1495                             'fps': int_or_none(representation_attrib.get('frameRate')),
1496                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1497                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1498                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1499                             'format_note': 'DASH %s' % content_type,
1500                             'filesize': filesize,
1501                         }
1502                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1503                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1504                             if 'total_number' not in representation_ms_info and 'segment_duration':
1505                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1506                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1507                             media_template = representation_ms_info['media_template']
1508                             media_template = media_template.replace('$RepresentationID$', representation_id)
1509                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1510                             media_template.replace('$$', '$')
1511                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1512                         if 'segment_urls' in representation_ms_info:
1513                             f.update({
1514                                 'segment_urls': representation_ms_info['segment_urls'],
1515                                 'protocol': 'http_dash_segments',
1516                             })
1517                             if 'initialization_url' in representation_ms_info:
1518                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1519                                 f.update({
1520                                     'initialization_url': initialization_url,
1521                                 })
1522                                 if not f.get('url'):
1523                                     f['url'] = initialization_url
1524                         try:
1525                             existing_format = next(
1526                                 fo for fo in formats
1527                                 if fo['format_id'] == representation_id)
1528                         except StopIteration:
1529                             full_info = formats_dict.get(representation_id, {}).copy()
1530                             full_info.update(f)
1531                             formats.append(full_info)
1532                         else:
1533                             existing_format.update(f)
1534                     else:
1535                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1536         self._sort_formats(formats)
1537         return formats
1538
1539     def _live_title(self, name):
1540         """ Generate the title for a live video """
1541         now = datetime.datetime.now()
1542         now_str = now.strftime('%Y-%m-%d %H:%M')
1543         return name + ' ' + now_str
1544
1545     def _int(self, v, name, fatal=False, **kwargs):
1546         res = int_or_none(v, **kwargs)
1547         if 'get_attr' in kwargs:
1548             print(getattr(v, kwargs['get_attr']))
1549         if res is None:
1550             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1551             if fatal:
1552                 raise ExtractorError(msg)
1553             else:
1554                 self._downloader.report_warning(msg)
1555         return res
1556
1557     def _float(self, v, name, fatal=False, **kwargs):
1558         res = float_or_none(v, **kwargs)
1559         if res is None:
1560             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1561             if fatal:
1562                 raise ExtractorError(msg)
1563             else:
1564                 self._downloader.report_warning(msg)
1565         return res
1566
1567     def _set_cookie(self, domain, name, value, expire_time=None):
1568         cookie = compat_cookiejar.Cookie(
1569             0, name, value, None, None, domain, None,
1570             None, '/', True, False, expire_time, '', None, None, None)
1571         self._downloader.cookiejar.set_cookie(cookie)
1572
1573     def _get_cookies(self, url):
1574         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1575         req = sanitized_Request(url)
1576         self._downloader.cookiejar.add_cookie_header(req)
1577         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1578
1579     def get_testcases(self, include_onlymatching=False):
1580         t = getattr(self, '_TEST', None)
1581         if t:
1582             assert not hasattr(self, '_TESTS'), \
1583                 '%s has _TEST and _TESTS' % type(self).__name__
1584             tests = [t]
1585         else:
1586             tests = getattr(self, '_TESTS', [])
1587         for t in tests:
1588             if not include_onlymatching and t.get('only_matching', False):
1589                 continue
1590             t['name'] = type(self).__name__[:-len('IE')]
1591             yield t
1592
1593     def is_suitable(self, age_limit):
1594         """ Test whether the extractor is generally suitable for the given
1595         age limit (i.e. pornographic sites are not, all others usually are) """
1596
1597         any_restricted = False
1598         for tc in self.get_testcases(include_onlymatching=False):
1599             if 'playlist' in tc:
1600                 tc = tc['playlist'][0]
1601             is_restricted = age_restricted(
1602                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1603             if not is_restricted:
1604                 return True
1605             any_restricted = any_restricted or is_restricted
1606         return not any_restricted
1607
1608     def extract_subtitles(self, *args, **kwargs):
1609         if (self._downloader.params.get('writesubtitles', False) or
1610                 self._downloader.params.get('listsubtitles')):
1611             return self._get_subtitles(*args, **kwargs)
1612         return {}
1613
1614     def _get_subtitles(self, *args, **kwargs):
1615         raise NotImplementedError('This method must be implemented by subclasses')
1616
1617     @staticmethod
1618     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1619         """ Merge subtitle items for one language. Items with duplicated URLs
1620         will be dropped. """
1621         list1_urls = set([item['url'] for item in subtitle_list1])
1622         ret = list(subtitle_list1)
1623         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1624         return ret
1625
1626     @classmethod
1627     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1628         """ Merge two subtitle dictionaries, language by language. """
1629         ret = dict(subtitle_dict1)
1630         for lang in subtitle_dict2:
1631             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1632         return ret
1633
1634     def extract_automatic_captions(self, *args, **kwargs):
1635         if (self._downloader.params.get('writeautomaticsub', False) or
1636                 self._downloader.params.get('listsubtitles')):
1637             return self._get_automatic_captions(*args, **kwargs)
1638         return {}
1639
1640     def _get_automatic_captions(self, *args, **kwargs):
1641         raise NotImplementedError('This method must be implemented by subclasses')
1642
1643     def mark_watched(self, *args, **kwargs):
1644         if (self._downloader.params.get('mark_watched', False) and
1645                 (self._get_login_info()[0] is not None or
1646                     self._downloader.params.get('cookiefile') is not None)):
1647             self._mark_watched(*args, **kwargs)
1648
1649     def _mark_watched(self, *args, **kwargs):
1650         raise NotImplementedError('This method must be implemented by subclasses')
1651
1652
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that handle paged search queries.
    Accepted queries have the form _SEARCH_KEY<prefix>:<query>, where the
    prefix is empty (one result), a positive number, or 'all'.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        # Regex matching '<key><prefix>:<query>' for this extractor's key.
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')

        # An empty prefix asks for a single result, 'all' for the maximum.
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        count = int(prefix)
        if count <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (count, query))
        if count > self._MAX_RESULTS:
            # Clamp oversized requests and tell the user about it.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
            count = self._MAX_RESULTS
        return self._get_n_results(query, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY