_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urlparse,
  26 )
  27 from ..utils import (
  28     NO_DEFAULT,
  29     age_restricted,
  30     bug_reports_message,
  31     clean_html,
  32     compiled_regex_type,
  33     determine_ext,
  34     error_to_compat_str,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     parse_iso8601,
  40     RegexNotFoundError,
  41     sanitize_filename,
  42     sanitized_Request,
  43     unescapeHTML,
  44     unified_strdate,
  45     url_basename,
  46     xpath_text,
  47     xpath_with_ns,
  48     determine_protocol,
  49     parse_duration,
  50     mimetype2ext,
  51     update_url_query,
  52 )
  53
  54
  55 class InfoExtractor(object):
  56     """Information Extractor class.
  57
  58     Information extractors are the classes that, given a URL, extract
  59     information about the video (or videos) the URL refers to. This
  60     information includes the real video URL, the video title, author and
  61     others. The information is stored in a dictionary which is then
  62     passed to the YoutubeDL. The YoutubeDL processes this
  63     information possibly downloading the video to the file system, among
  64     other possible outcomes.
  65
  66     The type field determines the type of the result.
  67     By far the most common value (and the default if _type is missing) is
  68     "video", which indicates a single video.
  69
  70     For a video, the dictionaries must include the following fields:
  71
  72     id:             Video identifier.
  73     title:          Video title, unescaped.
  74
  75     Additionally, it must contain either a formats entry or a url one:
  76
  77     formats:        A list of dictionaries for each format available, ordered
  78                     from worst to best quality.
  79
  80                     Potential fields:
  81                     * url        Mandatory. The URL of the video file
  82                     * ext        Will be calculated from URL if missing
  83                     * format     A human-readable description of the format
  84                                  ("mp4 container with h264/opus").
  85                                  Calculated from the format_id, width, height.
  86                                  and format_note fields if missing.
  87                     * format_id  A short description of the format
  88                                  ("mp4_h264_opus" or "19").
  89                                 Technically optional, but strongly recommended.
  90                     * format_note Additional info about the format
  91                                  ("3D" or "DASH video")
  92                     * width      Width of the video, if known
  93                     * height     Height of the video, if known
  94                     * resolution Textual description of width and height
  95                     * tbr        Average bitrate of audio and video in KBit/s
  96                     * abr        Average audio bitrate in KBit/s
  97                     * acodec     Name of the audio codec in use
  98                     * asr        Audio sampling rate in Hertz
  99                     * vbr        Average video bitrate in KBit/s
 100                     * fps        Frame rate
 101                     * vcodec     Name of the video codec in use
 102                     * container  Name of the container format
 103                     * filesize   The number of bytes, if known in advance
 104                     * filesize_approx  An estimate for the number of bytes
 105                     * player_url SWF Player URL (used for rtmpdump).
 106                     * protocol   The protocol that will be used for the actual
 107                                  download, lower-case.
 108                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 109                                  "m3u8", "m3u8_native" or "http_dash_segments".
 110                     * preference Order number of this format. If this field is
 111                                  present and not None, the formats get sorted
 112                                  by this field, regardless of all other values.
 113                                  -1 for default (order by other properties),
 114                                  -2 or smaller for less than default.
 115                                  < -1000 to hide the format (if there is
 116                                     another one which is strictly better)
 117                     * language   Language code, e.g. "de" or "en-US".
 118                     * language_preference  Is this in the language mentioned in
 119                                  the URL?
 120                                  10 if it's what the URL is about,
 121                                  -1 for default (don't know),
 122                                  -10 otherwise, other values reserved for now.
 123                     * quality    Order number of the video quality of this
 124                                  format, irrespective of the file format.
 125                                  -1 for default (order by other properties),
 126                                  -2 or smaller for less than default.
 127                     * source_preference  Order number for this video source
 128                                   (quality takes higher priority)
 129                                  -1 for default (order by other properties),
 130                                  -2 or smaller for less than default.
 131                     * http_headers  A dictionary of additional HTTP headers
 132                                  to add to the request.
 133                     * stretched_ratio  If given and not 1, indicates that the
 134                                  video's pixels are not square.
 135                                  width : height ratio as float.
 136                     * no_resume  The server does not support resuming the
 137                                  (HTTP or RTMP) download. Boolean.
 138
 139     url:            Final video URL.
 140     ext:            Video filename extension.
 141     format:         The video format, defaults to ext (used for --get-format)
 142     player_url:     SWF Player URL (used for rtmpdump).
 143
 144     The following fields are optional:
 145
 146     alt_title:      A secondary title of the video.
 147     display_id      An alternative identifier for the video, not necessarily
 148                     unique, but available before title. Typically, id is
 149                     something like "4234987", title "Dancing naked mole rats",
 150                     and display_id "dancing-naked-mole-rats"
 151     thumbnails:     A list of dictionaries, with the following entries:
 152                         * "id" (optional, string) - Thumbnail format ID
 153                         * "url"
 154                         * "preference" (optional, int) - quality of the image
 155                         * "width" (optional, int)
 156                         * "height" (optional, int)
  157                         * "resolution" (optional, string "{width}x{height}",
 158                                         deprecated)
 159     thumbnail:      Full URL to a video thumbnail image.
 160     description:    Full video description.
 161     uploader:       Full name of the video uploader.
 162     license:        License name the video is licensed under.
 163     creator:        The main artist who created the video.
 164     release_date:   The date (YYYYMMDD) when the video was released.
 165     timestamp:      UNIX timestamp of the moment the video became available.
 166     upload_date:    Video upload date (YYYYMMDD).
 167                     If not explicitly set, calculated from timestamp.
 168     uploader_id:    Nickname or id of the video uploader.
 169     uploader_url:   Full URL to a personal webpage of the video uploader.
 170     location:       Physical location where the video was filmed.
 171     subtitles:      The available subtitles as a dictionary in the format
 172                     {language: subformats}. "subformats" is a list sorted from
 173                     lower to higher preference, each element is a dictionary
 174                     with the "ext" entry and one of:
 175                         * "data": The subtitles file contents
 176                         * "url": A URL pointing to the subtitles file
 177                     "ext" will be calculated from URL if missing
 178     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 179                     automatically generated captions
 180     duration:       Length of the video in seconds, as an integer or float.
 181     view_count:     How many users have watched the video on the platform.
 182     like_count:     Number of positive ratings of the video
 183     dislike_count:  Number of negative ratings of the video
 184     repost_count:   Number of reposts of the video
  185     average_rating: Average rating given by users, the scale used depends on the webpage
 186     comment_count:  Number of comments on the video
 187     comments:       A list of comments, each with one or more of the following
 188                     properties (all but one of text or html optional):
 189                         * "author" - human-readable name of the comment author
 190                         * "author_id" - user ID of the comment author
 191                         * "id" - Comment ID
 192                         * "html" - Comment as HTML
 193                         * "text" - Plain text of the comment
 194                         * "timestamp" - UNIX timestamp of comment
 195                         * "parent" - ID of the comment this one is replying to.
 196                                      Set to "root" to indicate that this is a
 197                                      comment to the original video.
 198     age_limit:      Age restriction for the video, as an integer (years)
 199     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 200                     should allow to get the same result again. (It will be set
 201                     by YoutubeDL if it's missing)
 202     categories:     A list of categories that the video falls in, for example
 203                     ["Sports", "Berlin"]
 204     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 205     is_live:        True, False, or None (=unknown). Whether this video is a
 206                     live stream that goes on instead of a fixed-length video.
 207     start_time:     Time in seconds where the reproduction should start, as
 208                     specified in the URL.
 209     end_time:       Time in seconds where the reproduction should end, as
 210                     specified in the URL.
 211
 212     The following fields should only be used when the video belongs to some logical
 213     chapter or section:
 214
 215     chapter:        Name or title of the chapter the video belongs to.
 216     chapter_number: Number of the chapter the video belongs to, as an integer.
 217     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 218
 219     The following fields should only be used when the video is an episode of some
 220     series or programme:
 221
 222     series:         Title of the series or programme the video episode belongs to.
 223     season:         Title of the season the video episode belongs to.
 224     season_number:  Number of the season the video episode belongs to, as an integer.
 225     season_id:      Id of the season the video episode belongs to, as a unicode string.
 226     episode:        Title of the video episode. Unlike mandatory video title field,
 227                     this field should denote the exact title of the video episode
 228                     without any kind of decoration.
 229     episode_number: Number of the video episode within a season, as an integer.
 230     episode_id:     Id of the video episode, as a unicode string.
 231
 232     Unless mentioned otherwise, the fields should be Unicode strings.
 233
 234     Unless mentioned otherwise, None is equivalent to absence of information.
 235
 236
 237     _type "playlist" indicates multiple videos.
 238     There must be a key "entries", which is a list, an iterable, or a PagedList
 239     object, each element of which is a valid dictionary by this specification.
 240
 241     Additionally, playlists can have "title", "description" and "id" attributes
 242     with the same semantics as videos (see above).
 243
 244
 245     _type "multi_video" indicates that there are multiple videos that
  246     form a single show, for example multiple acts of an opera or TV episode.
 247     It must have an entries key like a playlist and contain all the keys
 248     required for a video at the same time.
 249
 250
 251     _type "url" indicates that the video must be extracted from another
 252     location, possibly by a different extractor. Its only required key is:
 253     "url" - the next URL to extract.
 254     The key "ie_key" can be set to the class name (minus the trailing "IE",
 255     e.g. "Youtube") if the extractor class is known in advance.
 256     Additionally, the dictionary may have any properties of the resolved entity
 257     known in advance, for example "title" if the title of the referred video is
 258     known ahead of time.
 259
 260
 261     _type "url_transparent" entities have the same specification as "url", but
 262     indicate that the given additional information is more precise than the one
 263     associated with the resolved URL.
 264     This is useful when a site employs a video service that hosts the video and
 265     its technical metadata, but that video service does not embed a useful
 266     title, description etc.
 267
 268
 269     Subclasses of this one should re-define the _real_initialize() and
 270     _real_extract() methods and define a _VALID_URL regexp.
 271     Probably, they should also be added to the list of extractors.
 272
 273     Finally, the _WORKING attribute should be set to False for broken IEs
 274     in order to warn the users and skip the tests.
 275     """
 276
 277     _ready = False
 278     _downloader = None
 279     _WORKING = True
 280
 281     def __init__(self, downloader=None):
 282         """Constructor. Receives an optional downloader."""
 283         self._ready = False
 284         self.set_downloader(downloader)
 285
 286     @classmethod
 287     def suitable(cls, url):
 288         """Receives a URL and returns True if suitable for this IE."""
 289
 290         # This does not use has/getattr intentionally - we want to know whether
 291         # we have cached the regexp for *this* class, whereas getattr would also
 292         # match the superclass
 293         if '_VALID_URL_RE' not in cls.__dict__:
 294             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 295         return cls._VALID_URL_RE.match(url) is not None
 296
 297     @classmethod
 298     def _match_id(cls, url):
 299         if '_VALID_URL_RE' not in cls.__dict__:
 300             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 301         m = cls._VALID_URL_RE.match(url)
 302         assert m
 303         return m.group('id')
 304
 305     @classmethod
 306     def working(cls):
 307         """Getter method for _WORKING."""
 308         return cls._WORKING
 309
 310     def initialize(self):
 311         """Initializes an instance (authentication, etc)."""
 312         if not self._ready:
 313             self._real_initialize()
 314             self._ready = True
 315
 316     def extract(self, url):
 317         """Extracts URL information and returns it in list of dicts."""
 318         try:
 319             self.initialize()
 320             return self._real_extract(url)
 321         except ExtractorError:
 322             raise
 323         except compat_http_client.IncompleteRead as e:
 324             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 325         except (KeyError, StopIteration) as e:
 326             raise ExtractorError('An extractor error has occurred.', cause=e)
 327
 328     def set_downloader(self, downloader):
 329         """Sets the downloader for this IE."""
 330         self._downloader = downloader
 331
 332     def _real_initialize(self):
 333         """Real initialization process. Redefine in subclasses."""
 334         pass
 335
 336     def _real_extract(self, url):
 337         """Real extraction process. Redefine in subclasses."""
 338         pass
 339
 340     @classmethod
 341     def ie_key(cls):
 342         """A string for getting the InfoExtractor with get_info_extractor"""
 343         return compat_str(cls.__name__[:-2])
 344
 345     @property
 346     def IE_NAME(self):
 347         return compat_str(type(self).__name__[:-2])
 348
 349     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
 350         """ Returns the response handle """
 351         if note is None:
 352             self.report_download_webpage(video_id)
 353         elif note is not False:
 354             if video_id is None:
 355                 self.to_screen('%s' % (note,))
 356             else:
 357                 self.to_screen('%s: %s' % (video_id, note))
 358         # data, headers and query params will be ignored for `Request` objects
 359         if isinstance(url_or_request, compat_str):
 360             if query:
 361                 url_or_request = update_url_query(url_or_request, query)
 362             if data or headers:
 363                 url_or_request = sanitized_Request(url_or_request, data, headers or {})
 364         try:
 365             return self._downloader.urlopen(url_or_request)
 366         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 367             if errnote is False:
 368                 return False
 369             if errnote is None:
 370                 errnote = 'Unable to download webpage'
 371
 372             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 373             if fatal:
 374                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 375             else:
 376                 self._downloader.report_warning(errmsg)
 377                 return False
 378
 379     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
 380         """ Returns a tuple (page content as string, URL handle) """
 381         # Strip hashes from the URL (#1038)
 382         if isinstance(url_or_request, (compat_str, str)):
 383             url_or_request = url_or_request.partition('#')[0]
 384
 385         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 386         if urlh is False:
 387             assert not fatal
 388             return False
 389         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 390         return (content, urlh)
 391
 392     @staticmethod
 393     def _guess_encoding_from_content(content_type, webpage_bytes):
 394         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 395         if m:
 396             encoding = m.group(1)
 397         else:
 398             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 399                           webpage_bytes[:1024])
 400             if m:
 401                 encoding = m.group(1).decode('ascii')
 402             elif webpage_bytes.startswith(b'\xff\xfe'):
 403                 encoding = 'utf-16'
 404             else:
 405                 encoding = 'utf-8'
 406
 407         return encoding
 408
 409     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 410         content_type = urlh.headers.get('Content-Type', '')
 411         webpage_bytes = urlh.read()
 412         if prefix is not None:
 413             webpage_bytes = prefix + webpage_bytes
 414         if not encoding:
 415             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 416         if self._downloader.params.get('dump_intermediate_pages', False):
 417             try:
 418                 url = url_or_request.get_full_url()
 419             except AttributeError:
 420                 url = url_or_request
 421             self.to_screen('Dumping request to ' + url)
 422             dump = base64.b64encode(webpage_bytes).decode('ascii')
 423             self._downloader.to_screen(dump)
 424         if self._downloader.params.get('write_pages', False):
 425             try:
 426                 url = url_or_request.get_full_url()
 427             except AttributeError:
 428                 url = url_or_request
 429             basen = '%s_%s' % (video_id, url)
 430             if len(basen) > 240:
 431                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 432                 basen = basen[:240 - len(h)] + h
 433             raw_filename = basen + '.dump'
 434             filename = sanitize_filename(raw_filename, restricted=True)
 435             self.to_screen('Saving request to ' + filename)
 436             # Working around MAX_PATH limitation on Windows (see
 437             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 438             if compat_os_name == 'nt':
 439                 absfilepath = os.path.abspath(filename)
 440                 if len(absfilepath) > 259:
 441                     filename = '\\\\?\\' + absfilepath
 442             with open(filename, 'wb') as outf:
 443                 outf.write(webpage_bytes)
 444
 445         try:
 446             content = webpage_bytes.decode(encoding, 'replace')
 447         except LookupError:
 448             content = webpage_bytes.decode('utf-8', 'replace')
 449
 450         if ('<title>Access to this site is blocked</title>' in content and
 451                 'Websense' in content[:512]):
 452             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 453             blocked_iframe = self._html_search_regex(
 454                 r'<iframe src="([^"]+)"', content,
 455                 'Websense information URL', default=None)
 456             if blocked_iframe:
 457                 msg += ' Visit %s for more details' % blocked_iframe
 458             raise ExtractorError(msg, expected=True)
 459         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 460             msg = (
 461                 'Access to this webpage has been blocked by Indian censorship. '
 462                 'Use a VPN or proxy server (with --proxy) to route around it.')
 463             block_msg = self._html_search_regex(
 464                 r'</h1><p>(.*?)</p>',
 465                 content, 'block message', default=None)
 466             if block_msg:
 467                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 468             raise ExtractorError(msg, expected=True)
 469
 470         return content
 471
 472     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
 473         """ Returns the data of the page as a string """
 474         success = False
 475         try_count = 0
 476         while success is False:
 477             try:
 478                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 479                 success = True
 480             except compat_http_client.IncompleteRead as e:
 481                 try_count += 1
 482                 if try_count >= tries:
 483                     raise e
 484                 self._sleep(timeout, video_id)
 485         if res is False:
 486             return res
 487         else:
 488             content, _ = res
 489             return content
 490
 491     def _download_xml(self, url_or_request, video_id,
 492                       note='Downloading XML', errnote='Unable to download XML',
 493                       transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
 494         """Return the xml as an xml.etree.ElementTree.Element"""
 495         xml_string = self._download_webpage(
 496             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 497         if xml_string is False:
 498             return xml_string
 499         if transform_source:
 500             xml_string = transform_source(xml_string)
 501         return compat_etree_fromstring(xml_string.encode('utf-8'))
 502
 503     def _download_json(self, url_or_request, video_id,
 504                        note='Downloading JSON metadata',
 505                        errnote='Unable to download JSON metadata',
 506                        transform_source=None,
 507                        fatal=True, encoding=None, data=None, headers=None, query=None):
 508         json_string = self._download_webpage(
 509             url_or_request, video_id, note, errnote, fatal=fatal,
 510             encoding=encoding, data=data, headers=headers, query=query)
 511         if (not fatal) and json_string is False:
 512             return None
 513         return self._parse_json(
 514             json_string, video_id, transform_source=transform_source, fatal=fatal)
 515
 516     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 517         if transform_source:
 518             json_string = transform_source(json_string)
 519         try:
 520             return json.loads(json_string)
 521         except ValueError as ve:
 522             errmsg = '%s: Failed to parse JSON ' % video_id
 523             if fatal:
 524                 raise ExtractorError(errmsg, cause=ve)
 525             else:
 526                 self.report_warning(errmsg + str(ve))
 527
 528     def report_warning(self, msg, video_id=None):
 529         idstr = '' if video_id is None else '%s: ' % video_id
 530         self._downloader.report_warning(
 531             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 532
 533     def to_screen(self, msg):
 534         """Print msg to screen, prefixing it with '[ie_name]'"""
 535         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 536
 537     def report_extraction(self, id_or_name):
 538         """Report information extraction."""
 539         self.to_screen('%s: Extracting information' % id_or_name)
 540
 541     def report_download_webpage(self, video_id):
 542         """Report webpage download."""
 543         self.to_screen('%s: Downloading webpage' % video_id)
 544
 545     def report_age_confirmation(self):
 546         """Report attempt to confirm age."""
 547         self.to_screen('Confirming age')
 548
 549     def report_login(self):
 550         """Report attempt to log in."""
 551         self.to_screen('Logging in')
 552
 553     @staticmethod
 554     def raise_login_required(msg='This video is only available for registered users'):
 555         raise ExtractorError(
 556             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 557             expected=True)
 558
 559     @staticmethod
 560     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 561         raise ExtractorError(
 562             '%s. You might want to use --proxy to workaround.' % msg,
 563             expected=True)
 564
 565     # Methods for following #608
 566     @staticmethod
 567     def url_result(url, ie=None, video_id=None, video_title=None):
 568         """Returns a URL that points to a page that should be processed"""
 569         # TODO: ie should be the class used for getting the info
 570         video_info = {'_type': 'url',
 571                       'url': url,
 572                       'ie_key': ie}
 573         if video_id is not None:
 574             video_info['id'] = video_id
 575         if video_title is not None:
 576             video_info['title'] = video_title
 577         return video_info
 578
 579     @staticmethod
 580     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 581         """Returns a playlist"""
 582         video_info = {'_type': 'playlist',
 583                       'entries': entries}
 584         if playlist_id:
 585             video_info['id'] = playlist_id
 586         if playlist_title:
 587             video_info['title'] = playlist_title
 588         if playlist_description:
 589             video_info['description'] = playlist_description
 590         return video_info
 591
 592     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 593         """
 594         Perform a regex search on the given string, using a single or a list of
 595         patterns returning the first matching group.
 596         In case of failure return a default value or raise a WARNING or a
 597         RegexNotFoundError, depending on fatal, specifying the field name.
 598         """
 599         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 600             mobj = re.search(pattern, string, flags)
 601         else:
 602             for p in pattern:
 603                 mobj = re.search(p, string, flags)
 604                 if mobj:
 605                     break
 606
 607         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 608             _name = '\033[0;34m%s\033[0m' % name
 609         else:
 610             _name = name
 611
 612         if mobj:
 613             if group is None:
 614                 # return the first matching group
 615                 return next(g for g in mobj.groups() if g is not None)
 616             else:
 617                 return mobj.group(group)
 618         elif default is not NO_DEFAULT:
 619             return default
 620         elif fatal:
 621             raise RegexNotFoundError('Unable to extract %s' % _name)
 622         else:
 623             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 624             return None
 625
 626     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 627         """
 628         Like _search_regex, but strips HTML tags and unescapes entities.
 629         """
 630         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 631         if res:
 632             return clean_html(res).strip()
 633         else:
 634             return res
 635
 636     def _get_login_info(self):
 637         """
 638         Get the login info as (username, password)
 639         It will look in the netrc file using the _NETRC_MACHINE value
 640         If there's no info available, return (None, None)
 641         """
 642         if self._downloader is None:
 643             return (None, None)
 644
 645         username = None
 646         password = None
 647         downloader_params = self._downloader.params
 648
 649         # Attempt to use provided username and password or .netrc data
 650         if downloader_params.get('username') is not None:
 651             username = downloader_params['username']
 652             password = downloader_params['password']
 653         elif downloader_params.get('usenetrc', False):
 654             try:
 655                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 656                 if info is not None:
 657                     username = info[0]
 658                     password = info[2]
 659                 else:
 660                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 661             except (IOError, netrc.NetrcParseError) as err:
 662                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 663
 664         return (username, password)
 665
 666     def _get_tfa_info(self, note='two-factor verification code'):
 667         """
 668         Get the two-factor authentication info
 669         TODO - asking the user will be required for sms/phone verify
 670         currently just uses the command line option
 671         If there's no info available, return None
 672         """
 673         if self._downloader is None:
 674             return None
 675         downloader_params = self._downloader.params
 676
 677         if downloader_params.get('twofactor') is not None:
 678             return downloader_params['twofactor']
 679
 680         return compat_getpass('Type %s and press [Return]: ' % note)
 681
 682     # Helper functions for extracting OpenGraph info
 683     @staticmethod
 684     def _og_regexes(prop):
 685         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 686         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 687                        % {'prop': re.escape(prop)})
 688         template = r'<meta[^>]+?%s[^>]+?%s'
 689         return [
 690             template % (property_re, content_re),
 691             template % (content_re, property_re),
 692         ]
 693
 694     @staticmethod
 695     def _meta_regex(prop):
 696         return r'''(?isx)<meta
 697                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 698                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 699
 700     def _og_search_property(self, prop, html, name=None, **kargs):
 701         if name is None:
 702             name = 'OpenGraph %s' % prop
 703         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 704         if escaped is None:
 705             return None
 706         return unescapeHTML(escaped)
 707
 708     def _og_search_thumbnail(self, html, **kargs):
 709         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 710
 711     def _og_search_description(self, html, **kargs):
 712         return self._og_search_property('description', html, fatal=False, **kargs)
 713
 714     def _og_search_title(self, html, **kargs):
 715         return self._og_search_property('title', html, **kargs)
 716
 717     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 718         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 719         if secure:
 720             regexes = self._og_regexes('video:secure_url') + regexes
 721         return self._html_search_regex(regexes, html, name, **kargs)
 722
 723     def _og_search_url(self, html, **kargs):
 724         return self._og_search_property('url', html, **kargs)
 725
 726     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 727         if display_name is None:
 728             display_name = name
 729         return self._html_search_regex(
 730             self._meta_regex(name),
 731             html, display_name, fatal=fatal, group='content', **kwargs)
 732
 733     def _dc_search_uploader(self, html):
 734         return self._html_search_meta('dc.creator', html, 'uploader')
 735
 736     def _rta_search(self, html):
 737         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 738         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 739                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 740                      html):
 741             return 18
 742         return 0
 743
 744     def _media_rating_search(self, html):
 745         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 746         rating = self._html_search_meta('rating', html)
 747
 748         if not rating:
 749             return None
 750
 751         RATING_TABLE = {
 752             'safe for kids': 0,
 753             'general': 8,
 754             '14 years': 14,
 755             'mature': 17,
 756             'restricted': 19,
 757         }
 758         return RATING_TABLE.get(rating.lower())
 759
 760     def _family_friendly_search(self, html):
 761         # See http://schema.org/VideoObject
 762         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 763
 764         if not family_friendly:
 765             return None
 766
 767         RATING_TABLE = {
 768             '1': 0,
 769             'true': 0,
 770             '0': 18,
 771             'false': 18,
 772         }
 773         return RATING_TABLE.get(family_friendly.lower())
 774
 775     def _twitter_search_player(self, html):
 776         return self._html_search_meta('twitter:player', html,
 777                                       'twitter card player')
 778
 779     def _search_json_ld(self, html, video_id, **kwargs):
 780         json_ld = self._search_regex(
 781             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 782             html, 'JSON-LD', group='json_ld', **kwargs)
 783         if not json_ld:
 784             return {}
 785         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 786
 787     def _json_ld(self, json_ld, video_id, fatal=True):
 788         if isinstance(json_ld, compat_str):
 789             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 790         if not json_ld:
 791             return {}
 792         info = {}
 793         if json_ld.get('@context') == 'http://schema.org':
 794             item_type = json_ld.get('@type')
 795             if item_type == 'TVEpisode':
 796                 info.update({
 797                     'episode': unescapeHTML(json_ld.get('name')),
 798                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 799                     'description': unescapeHTML(json_ld.get('description')),
 800                 })
 801                 part_of_season = json_ld.get('partOfSeason')
 802                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 803                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 804                 part_of_series = json_ld.get('partOfSeries')
 805                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 806                     info['series'] = unescapeHTML(part_of_series.get('name'))
 807             elif item_type == 'Article':
 808                 info.update({
 809                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 810                     'title': unescapeHTML(json_ld.get('headline')),
 811                     'description': unescapeHTML(json_ld.get('articleBody')),
 812                 })
 813         return dict((k, v) for k, v in info.items() if v is not None)
 814
 815     @staticmethod
 816     def _hidden_inputs(html):
 817         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 818         hidden_inputs = {}
 819         for input in re.findall(r'(?i)<input([^>]+)>', html):
 820             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 821                 continue
 822             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 823             if not name:
 824                 continue
 825             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 826             if not value:
 827                 continue
 828             hidden_inputs[name.group('value')] = value.group('value')
 829         return hidden_inputs
 830
 831     def _form_hidden_inputs(self, form_id, html):
 832         form = self._search_regex(
 833             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 834             html, '%s form' % form_id, group='form')
 835         return self._hidden_inputs(form)
 836
 837     def _sort_formats(self, formats, field_preference=None):
 838         if not formats:
 839             raise ExtractorError('No video formats found')
 840
 841         for f in formats:
 842             # Automatically determine tbr when missing based on abr and vbr (improves
 843             # formats sorting in some cases)
 844             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 845                 f['tbr'] = f['abr'] + f['vbr']
 846
 847         def _formats_key(f):
 848             # TODO remove the following workaround
 849             from ..utils import determine_ext
 850             if not f.get('ext') and 'url' in f:
 851                 f['ext'] = determine_ext(f['url'])
 852
 853             if isinstance(field_preference, (list, tuple)):
 854                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 855
 856             preference = f.get('preference')
 857             if preference is None:
 858                 preference = 0
 859                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 860                     preference -= 0.5
 861
 862             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 863
 864             if f.get('vcodec') == 'none':  # audio only
 865                 preference -= 50
 866                 if self._downloader.params.get('prefer_free_formats'):
 867                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 868                 else:
 869                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 870                 ext_preference = 0
 871                 try:
 872                     audio_ext_preference = ORDER.index(f['ext'])
 873                 except ValueError:
 874                     audio_ext_preference = -1
 875             else:
 876                 if f.get('acodec') == 'none':  # video only
 877                     preference -= 40
 878                 if self._downloader.params.get('prefer_free_formats'):
 879                     ORDER = ['flv', 'mp4', 'webm']
 880                 else:
 881                     ORDER = ['webm', 'flv', 'mp4']
 882                 try:
 883                     ext_preference = ORDER.index(f['ext'])
 884                 except ValueError:
 885                     ext_preference = -1
 886                 audio_ext_preference = 0
 887
 888             return (
 889                 preference,
 890                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 891                 f.get('quality') if f.get('quality') is not None else -1,
 892                 f.get('tbr') if f.get('tbr') is not None else -1,
 893                 f.get('filesize') if f.get('filesize') is not None else -1,
 894                 f.get('vbr') if f.get('vbr') is not None else -1,
 895                 f.get('height') if f.get('height') is not None else -1,
 896                 f.get('width') if f.get('width') is not None else -1,
 897                 proto_preference,
 898                 ext_preference,
 899                 f.get('abr') if f.get('abr') is not None else -1,
 900                 audio_ext_preference,
 901                 f.get('fps') if f.get('fps') is not None else -1,
 902                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 903                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 904                 f.get('format_id') if f.get('format_id') is not None else '',
 905             )
 906         formats.sort(key=_formats_key)
 907
 908     def _check_formats(self, formats, video_id):
 909         if formats:
 910             formats[:] = filter(
 911                 lambda f: self._is_valid_url(
 912                     f['url'], video_id,
 913                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 914                 formats)
 915
 916     @staticmethod
 917     def _remove_duplicate_formats(formats):
 918         format_urls = set()
 919         unique_formats = []
 920         for f in formats:
 921             if f['url'] not in format_urls:
 922                 format_urls.add(f['url'])
 923                 unique_formats.append(f)
 924         formats[:] = unique_formats
 925
 926     def _is_valid_url(self, url, video_id, item='video'):
 927         url = self._proto_relative_url(url, scheme='http:')
 928         # For now assume non HTTP(S) URLs always valid
 929         if not (url.startswith('http://') or url.startswith('https://')):
 930             return True
 931         try:
 932             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 933             return True
 934         except ExtractorError as e:
 935             if isinstance(e.cause, compat_urllib_error.URLError):
 936                 self.to_screen(
 937                     '%s: %s URL is invalid, skipping' % (video_id, item))
 938                 return False
 939             raise
 940
 941     def http_scheme(self):
 942         """ Either "http:" or "https:", depending on the user's preferences """
 943         return (
 944             'http:'
 945             if self._downloader.params.get('prefer_insecure', False)
 946             else 'https:')
 947
 948     def _proto_relative_url(self, url, scheme=None):
 949         if url is None:
 950             return url
 951         if url.startswith('//'):
 952             if scheme is None:
 953                 scheme = self.http_scheme()
 954             return scheme + url
 955         else:
 956             return url
 957
 958     def _sleep(self, timeout, video_id, msg_template=None):
 959         if msg_template is None:
 960             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 961         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 962         self.to_screen(msg)
 963         time.sleep(timeout)
 964
 965     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 966                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 967                              fatal=True):
 968         manifest = self._download_xml(
 969             manifest_url, video_id, 'Downloading f4m manifest',
 970             'Unable to download f4m manifest',
 971             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 972             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 973             transform_source=transform_source,
 974             fatal=fatal)
 975
 976         if manifest is False:
 977             return []
 978
 979         return self._parse_f4m_formats(
 980             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
 981             transform_source=transform_source, fatal=fatal)
 982
 983     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
 984                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
 985                            fatal=True):
 986         formats = []
 987         manifest_version = '1.0'
 988         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 989         if not media_nodes:
 990             manifest_version = '2.0'
 991             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 992         base_url = xpath_text(
 993             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 994             'base URL', default=None)
 995         if base_url:
 996             base_url = base_url.strip()
 997         for i, media_el in enumerate(media_nodes):
 998             if manifest_version == '2.0':
 999                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
1000                 if not media_url:
1001                     continue
1002                 manifest_url = (
1003                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1004                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1005                 # If media_url is itself a f4m manifest do the recursive extraction
1006                 # since bitrates in parent manifest (this one) and media_url manifest
1007                 # may differ leading to inability to resolve the format by requested
1008                 # bitrate in f4m downloader
1009                 if determine_ext(manifest_url) == 'f4m':
1010                     formats.extend(self._extract_f4m_formats(
1011                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1012                         transform_source=transform_source, fatal=fatal))
1013                     continue
1014             tbr = int_or_none(media_el.attrib.get('bitrate'))
1015             formats.append({
1016                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1017                 'url': manifest_url,
1018                 'ext': 'flv',
1019                 'tbr': tbr,
1020                 'width': int_or_none(media_el.attrib.get('width')),
1021                 'height': int_or_none(media_el.attrib.get('height')),
1022                 'preference': preference,
1023             })
1024         return formats
1025
1026     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1027                               entry_protocol='m3u8', preference=None,
1028                               m3u8_id=None, note=None, errnote=None,
1029                               fatal=True):
1030
1031         formats = [{
1032             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1033             'url': m3u8_url,
1034             'ext': ext,
1035             'protocol': 'm3u8',
1036             'preference': preference - 1 if preference else -1,
1037             'resolution': 'multiple',
1038             'format_note': 'Quality selection URL',
1039         }]
1040
1041         format_url = lambda u: (
1042             u
1043             if re.match(r'^https?://', u)
1044             else compat_urlparse.urljoin(m3u8_url, u))
1045
1046         res = self._download_webpage_handle(
1047             m3u8_url, video_id,
1048             note=note or 'Downloading m3u8 information',
1049             errnote=errnote or 'Failed to download m3u8 information',
1050             fatal=fatal)
1051         if res is False:
1052             return []
1053         m3u8_doc, urlh = res
1054         m3u8_url = urlh.geturl()
1055
1056         # We should try extracting formats only from master playlists [1], i.e.
1057         # playlists that describe available qualities. On the other hand media
1058         # playlists [2] should be returned as is since they contain just the media
1059         # without qualities renditions.
1060         # Fortunately, master playlist can be easily distinguished from media
1061         # playlist based on particular tags availability. As of [1, 2] master
1062         # playlist tags MUST NOT appear in a media playist and vice versa.
1063         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1064         # and MUST NOT appear in master playlist thus we can clearly detect media
1065         # playlist with this criterion.
1066         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1067         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1068         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1069         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1070             return [{
1071                 'url': m3u8_url,
1072                 'format_id': m3u8_id,
1073                 'ext': ext,
1074                 'protocol': entry_protocol,
1075                 'preference': preference,
1076             }]
1077         last_info = None
1078         last_media = None
1079         kv_rex = re.compile(
1080             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1081         for line in m3u8_doc.splitlines():
1082             if line.startswith('#EXT-X-STREAM-INF:'):
1083                 last_info = {}
1084                 for m in kv_rex.finditer(line):
1085                     v = m.group('val')
1086                     if v.startswith('"'):
1087                         v = v[1:-1]
1088                     last_info[m.group('key')] = v
1089             elif line.startswith('#EXT-X-MEDIA:'):
1090                 last_media = {}
1091                 for m in kv_rex.finditer(line):
1092                     v = m.group('val')
1093                     if v.startswith('"'):
1094                         v = v[1:-1]
1095                     last_media[m.group('key')] = v
1096             elif line.startswith('#') or not line.strip():
1097                 continue
1098             else:
1099                 if last_info is None:
1100                     formats.append({'url': format_url(line)})
1101                     continue
1102                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1103                 format_id = []
1104                 if m3u8_id:
1105                     format_id.append(m3u8_id)
1106                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1107                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1108                 f = {
1109                     'format_id': '-'.join(format_id),
1110                     'url': format_url(line.strip()),
1111                     'tbr': tbr,
1112                     'ext': ext,
1113                     'protocol': entry_protocol,
1114                     'preference': preference,
1115                 }
1116                 resolution = last_info.get('RESOLUTION')
1117                 if resolution:
1118                     width_str, height_str = resolution.split('x')
1119                     f['width'] = int(width_str)
1120                     f['height'] = int(height_str)
1121                 codecs = last_info.get('CODECS')
1122                 if codecs:
1123                     vcodec, acodec = [None] * 2
1124                     va_codecs = codecs.split(',')
1125                     if len(va_codecs) == 1:
1126                         # Audio only entries usually come with single codec and
1127                         # no resolution. For more robustness we also check it to
1128                         # be mp4 audio.
1129                         if not resolution and va_codecs[0].startswith('mp4a'):
1130                             vcodec, acodec = 'none', va_codecs[0]
1131                         else:
1132                             vcodec = va_codecs[0]
1133                     else:
1134                         vcodec, acodec = va_codecs[:2]
1135                     f.update({
1136                         'acodec': acodec,
1137                         'vcodec': vcodec,
1138                     })
1139                 if last_media is not None:
1140                     f['m3u8_media'] = last_media
1141                     last_media = None
1142                 formats.append(f)
1143                 last_info = {}
1144         return formats
1145
1146     @staticmethod
1147     def _xpath_ns(path, namespace=None):
1148         if not namespace:
1149             return path
1150         out = []
1151         for c in path.split('/'):
1152             if not c or c == '.':
1153                 out.append(c)
1154             else:
1155                 out.append('{%s}%s' % (namespace, c))
1156         return '/'.join(out)
1157
1158     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1159         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1160
1161         if smil is False:
1162             assert not fatal
1163             return []
1164
1165         namespace = self._parse_smil_namespace(smil)
1166
1167         return self._parse_smil_formats(
1168             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1169
1170     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1171         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1172         if smil is False:
1173             return {}
1174         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1175
1176     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1177         return self._download_xml(
1178             smil_url, video_id, 'Downloading SMIL file',
1179             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1180
1181     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1182         namespace = self._parse_smil_namespace(smil)
1183
1184         formats = self._parse_smil_formats(
1185             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1186         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1187
1188         video_id = os.path.splitext(url_basename(smil_url))[0]
1189         title = None
1190         description = None
1191         upload_date = None
1192         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1193             name = meta.attrib.get('name')
1194             content = meta.attrib.get('content')
1195             if not name or not content:
1196                 continue
1197             if not title and name == 'title':
1198                 title = content
1199             elif not description and name in ('description', 'abstract'):
1200                 description = content
1201             elif not upload_date and name == 'date':
1202                 upload_date = unified_strdate(content)
1203
1204         thumbnails = [{
1205             'id': image.get('type'),
1206             'url': image.get('src'),
1207             'width': int_or_none(image.get('width')),
1208             'height': int_or_none(image.get('height')),
1209         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1210
1211         return {
1212             'id': video_id,
1213             'title': title or video_id,
1214             'description': description,
1215             'upload_date': upload_date,
1216             'thumbnails': thumbnails,
1217             'formats': formats,
1218             'subtitles': subtitles,
1219         }
1220
1221     def _parse_smil_namespace(self, smil):
1222         return self._search_regex(
1223             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1224
1225     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1226         base = smil_url
1227         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1228             b = meta.get('base') or meta.get('httpBase')
1229             if b:
1230                 base = b
1231                 break
1232
1233         formats = []
1234         rtmp_count = 0
1235         http_count = 0
1236         m3u8_count = 0
1237
1238         srcs = []
1239         videos = smil.findall(self._xpath_ns('.//video', namespace))
1240         for video in videos:
1241             src = video.get('src')
1242             if not src or src in srcs:
1243                 continue
1244             srcs.append(src)
1245
1246             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1247             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1248             width = int_or_none(video.get('width'))
1249             height = int_or_none(video.get('height'))
1250             proto = video.get('proto')
1251             ext = video.get('ext')
1252             src_ext = determine_ext(src)
1253             streamer = video.get('streamer') or base
1254
1255             if proto == 'rtmp' or streamer.startswith('rtmp'):
1256                 rtmp_count += 1
1257                 formats.append({
1258                     'url': streamer,
1259                     'play_path': src,
1260                     'ext': 'flv',
1261                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1262                     'tbr': bitrate,
1263                     'filesize': filesize,
1264                     'width': width,
1265                     'height': height,
1266                 })
1267                 if transform_rtmp_url:
1268                     streamer, src = transform_rtmp_url(streamer, src)
1269                     formats[-1].update({
1270                         'url': streamer,
1271                         'play_path': src,
1272                     })
1273                 continue
1274
1275             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1276             src_url = src_url.strip()
1277
1278             if proto == 'm3u8' or src_ext == 'm3u8':
1279                 m3u8_formats = self._extract_m3u8_formats(
1280                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1281                 if len(m3u8_formats) == 1:
1282                     m3u8_count += 1
1283                     m3u8_formats[0].update({
1284                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1285                         'tbr': bitrate,
1286                         'width': width,
1287                         'height': height,
1288                     })
1289                 formats.extend(m3u8_formats)
1290                 continue
1291
1292             if src_ext == 'f4m':
1293                 f4m_url = src_url
1294                 if not f4m_params:
1295                     f4m_params = {
1296                         'hdcore': '3.2.0',
1297                         'plugin': 'flowplayer-3.2.0.1',
1298                     }
1299                 f4m_url += '&' if '?' in f4m_url else '?'
1300                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1301                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1302                 continue
1303
1304             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1305                 http_count += 1
1306                 formats.append({
1307                     'url': src_url,
1308                     'ext': ext or src_ext or 'flv',
1309                     'format_id': 'http-%d' % (bitrate or http_count),
1310                     'tbr': bitrate,
1311                     'filesize': filesize,
1312                     'width': width,
1313                     'height': height,
1314                 })
1315                 continue
1316
1317         return formats
1318
1319     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1320         urls = []
1321         subtitles = {}
1322         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1323             src = textstream.get('src')
1324             if not src or src in urls:
1325                 continue
1326             urls.append(src)
1327             ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1328             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1329             subtitles.setdefault(lang, []).append({
1330                 'url': src,
1331                 'ext': ext,
1332             })
1333         return subtitles
1334
1335     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1336         xspf = self._download_xml(
1337             playlist_url, playlist_id, 'Downloading xpsf playlist',
1338             'Unable to download xspf manifest', fatal=fatal)
1339         if xspf is False:
1340             return []
1341         return self._parse_xspf(xspf, playlist_id)
1342
1343     def _parse_xspf(self, playlist, playlist_id):
1344         NS_MAP = {
1345             'xspf': 'http://xspf.org/ns/0/',
1346             's1': 'http://static.streamone.nl/player/ns/0',
1347         }
1348
1349         entries = []
1350         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1351             title = xpath_text(
1352                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1353             description = xpath_text(
1354                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1355             thumbnail = xpath_text(
1356                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1357             duration = float_or_none(
1358                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1359
1360             formats = [{
1361                 'url': location.text,
1362                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1363                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1364                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1365             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1366             self._sort_formats(formats)
1367
1368             entries.append({
1369                 'id': playlist_id,
1370                 'title': title,
1371                 'description': description,
1372                 'thumbnail': thumbnail,
1373                 'duration': duration,
1374                 'formats': formats,
1375             })
1376         return entries
1377
1378     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1379         res = self._download_webpage_handle(
1380             mpd_url, video_id,
1381             note=note or 'Downloading MPD manifest',
1382             errnote=errnote or 'Failed to download MPD manifest',
1383             fatal=fatal)
1384         if res is False:
1385             return []
1386         mpd, urlh = res
1387         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1388
1389         return self._parse_mpd_formats(
1390             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1391
1392     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1393         if mpd_doc.get('type') == 'dynamic':
1394             return []
1395
1396         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1397
1398         def _add_ns(path):
1399             return self._xpath_ns(path, namespace)
1400
1401         def is_drm_protected(element):
1402             return element.find(_add_ns('ContentProtection')) is not None
1403
1404         def extract_multisegment_info(element, ms_parent_info):
1405             ms_info = ms_parent_info.copy()
1406             segment_list = element.find(_add_ns('SegmentList'))
1407             if segment_list is not None:
1408                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1409                 if segment_urls_e:
1410                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1411                 initialization = segment_list.find(_add_ns('Initialization'))
1412                 if initialization is not None:
1413                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1414             else:
1415                 segment_template = element.find(_add_ns('SegmentTemplate'))
1416                 if segment_template is not None:
1417                     start_number = segment_template.get('startNumber')
1418                     if start_number:
1419                         ms_info['start_number'] = int(start_number)
1420                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1421                     if segment_timeline is not None:
1422                         s_e = segment_timeline.findall(_add_ns('S'))
1423                         if s_e:
1424                             ms_info['total_number'] = 0
1425                             for s in s_e:
1426                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1427                     else:
1428                         timescale = segment_template.get('timescale')
1429                         if timescale:
1430                             ms_info['timescale'] = int(timescale)
1431                         segment_duration = segment_template.get('duration')
1432                         if segment_duration:
1433                             ms_info['segment_duration'] = int(segment_duration)
1434                     media_template = segment_template.get('media')
1435                     if media_template:
1436                         ms_info['media_template'] = media_template
1437                     initialization = segment_template.get('initialization')
1438                     if initialization:
1439                         ms_info['initialization_url'] = initialization
1440                     else:
1441                         initialization = segment_template.find(_add_ns('Initialization'))
1442                         if initialization is not None:
1443                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1444             return ms_info
1445
1446         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1447         formats = []
1448         for period in mpd_doc.findall(_add_ns('Period')):
1449             period_duration = parse_duration(period.get('duration')) or mpd_duration
1450             period_ms_info = extract_multisegment_info(period, {
1451                 'start_number': 1,
1452                 'timescale': 1,
1453             })
1454             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1455                 if is_drm_protected(adaptation_set):
1456                     continue
1457                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1458                 for representation in adaptation_set.findall(_add_ns('Representation')):
1459                     if is_drm_protected(representation):
1460                         continue
1461                     representation_attrib = adaptation_set.attrib.copy()
1462                     representation_attrib.update(representation.attrib)
1463                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1464                     mime_type = representation_attrib['mimeType']
1465                     content_type = mime_type.split('/')[0]
1466                     if content_type == 'text':
1467                         # TODO implement WebVTT downloading
1468                         pass
1469                     elif content_type == 'video' or content_type == 'audio':
1470                         base_url = ''
1471                         for element in (representation, adaptation_set, period, mpd_doc):
1472                             base_url_e = element.find(_add_ns('BaseURL'))
1473                             if base_url_e is not None:
1474                                 base_url = base_url_e.text + base_url
1475                                 if re.match(r'^https?://', base_url):
1476                                     break
1477                         if mpd_base_url and not re.match(r'^https?://', base_url):
1478                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1479                                 mpd_base_url += '/'
1480                             base_url = mpd_base_url + base_url
1481                         representation_id = representation_attrib.get('id')
1482                         lang = representation_attrib.get('lang')
1483                         url_el = representation.find(_add_ns('BaseURL'))
1484                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1485                         f = {
1486                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1487                             'url': base_url,
1488                             'ext': mimetype2ext(mime_type),
1489                             'width': int_or_none(representation_attrib.get('width')),
1490                             'height': int_or_none(representation_attrib.get('height')),
1491                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1492                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1493                             'fps': int_or_none(representation_attrib.get('frameRate')),
1494                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1495                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1496                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1497                             'format_note': 'DASH %s' % content_type,
1498                             'filesize': filesize,
1499                         }
1500                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1501                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1502                             if 'total_number' not in representation_ms_info and 'segment_duration':
1503                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1504                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1505                             media_template = representation_ms_info['media_template']
1506                             media_template = media_template.replace('$RepresentationID$', representation_id)
1507                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1508                             media_template.replace('$$', '$')
1509                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1510                         if 'segment_urls' in representation_ms_info:
1511                             f.update({
1512                                 'segment_urls': representation_ms_info['segment_urls'],
1513                                 'protocol': 'http_dash_segments',
1514                             })
1515                             if 'initialization_url' in representation_ms_info:
1516                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1517                                 f.update({
1518                                     'initialization_url': initialization_url,
1519                                 })
1520                                 if not f.get('url'):
1521                                     f['url'] = initialization_url
1522                         try:
1523                             existing_format = next(
1524                                 fo for fo in formats
1525                                 if fo['format_id'] == representation_id)
1526                         except StopIteration:
1527                             full_info = formats_dict.get(representation_id, {}).copy()
1528                             full_info.update(f)
1529                             formats.append(full_info)
1530                         else:
1531                             existing_format.update(f)
1532                     else:
1533                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1534         return formats
1535
1536     def _live_title(self, name):
1537         """ Generate the title for a live video """
1538         now = datetime.datetime.now()
1539         now_str = now.strftime('%Y-%m-%d %H:%M')
1540         return name + ' ' + now_str
1541
1542     def _int(self, v, name, fatal=False, **kwargs):
1543         res = int_or_none(v, **kwargs)
1544         if 'get_attr' in kwargs:
1545             print(getattr(v, kwargs['get_attr']))
1546         if res is None:
1547             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1548             if fatal:
1549                 raise ExtractorError(msg)
1550             else:
1551                 self._downloader.report_warning(msg)
1552         return res
1553
1554     def _float(self, v, name, fatal=False, **kwargs):
1555         res = float_or_none(v, **kwargs)
1556         if res is None:
1557             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1558             if fatal:
1559                 raise ExtractorError(msg)
1560             else:
1561                 self._downloader.report_warning(msg)
1562         return res
1563
1564     def _set_cookie(self, domain, name, value, expire_time=None):
1565         cookie = compat_cookiejar.Cookie(
1566             0, name, value, None, None, domain, None,
1567             None, '/', True, False, expire_time, '', None, None, None)
1568         self._downloader.cookiejar.set_cookie(cookie)
1569
1570     def _get_cookies(self, url):
1571         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1572         req = sanitized_Request(url)
1573         self._downloader.cookiejar.add_cookie_header(req)
1574         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1575
1576     def get_testcases(self, include_onlymatching=False):
1577         t = getattr(self, '_TEST', None)
1578         if t:
1579             assert not hasattr(self, '_TESTS'), \
1580                 '%s has _TEST and _TESTS' % type(self).__name__
1581             tests = [t]
1582         else:
1583             tests = getattr(self, '_TESTS', [])
1584         for t in tests:
1585             if not include_onlymatching and t.get('only_matching', False):
1586                 continue
1587             t['name'] = type(self).__name__[:-len('IE')]
1588             yield t
1589
1590     def is_suitable(self, age_limit):
1591         """ Test whether the extractor is generally suitable for the given
1592         age limit (i.e. pornographic sites are not, all others usually are) """
1593
1594         any_restricted = False
1595         for tc in self.get_testcases(include_onlymatching=False):
1596             if 'playlist' in tc:
1597                 tc = tc['playlist'][0]
1598             is_restricted = age_restricted(
1599                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1600             if not is_restricted:
1601                 return True
1602             any_restricted = any_restricted or is_restricted
1603         return not any_restricted
1604
1605     def extract_subtitles(self, *args, **kwargs):
1606         if (self._downloader.params.get('writesubtitles', False) or
1607                 self._downloader.params.get('listsubtitles')):
1608             return self._get_subtitles(*args, **kwargs)
1609         return {}
1610
1611     def _get_subtitles(self, *args, **kwargs):
1612         raise NotImplementedError('This method must be implemented by subclasses')
1613
1614     @staticmethod
1615     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1616         """ Merge subtitle items for one language. Items with duplicated URLs
1617         will be dropped. """
1618         list1_urls = set([item['url'] for item in subtitle_list1])
1619         ret = list(subtitle_list1)
1620         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1621         return ret
1622
1623     @classmethod
1624     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1625         """ Merge two subtitle dictionaries, language by language. """
1626         ret = dict(subtitle_dict1)
1627         for lang in subtitle_dict2:
1628             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1629         return ret
1630
1631     def extract_automatic_captions(self, *args, **kwargs):
1632         if (self._downloader.params.get('writeautomaticsub', False) or
1633                 self._downloader.params.get('listsubtitles')):
1634             return self._get_automatic_captions(*args, **kwargs)
1635         return {}
1636
1637     def _get_automatic_captions(self, *args, **kwargs):
1638         raise NotImplementedError('This method must be implemented by subclasses')
1639
1640     def mark_watched(self, *args, **kwargs):
1641         if (self._downloader.params.get('mark_watched', False) and
1642                 (self._get_login_info()[0] is not None or
1643                     self._downloader.params.get('cookiefile') is not None)):
1644             self._mark_watched(*args, **kwargs)
1645
1646     def _mark_watched(self, *args, **kwargs):
1647         raise NotImplementedError('This method must be implemented by subclasses')
1648
1649
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept queries of the form _SEARCH_KEY<prefix>:<query>, where the
    prefix is empty (one result), a positive number (that many results) or
    "all" (_MAX_RESULTS results).
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        pattern = cls._make_valid_url()
        return re.match(pattern, url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results."""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's limit
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor."""
        return self._SEARCH_KEY