git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urlparse,
  26 )
  27 from ..downloader.f4m import remove_encrypted_media
  28 from ..utils import (
  29     NO_DEFAULT,
  30     age_restricted,
  31     bug_reports_message,
  32     clean_html,
  33     compiled_regex_type,
  34     determine_ext,
  35     error_to_compat_str,
  36     ExtractorError,
  37     fix_xml_ampersands,
  38     float_or_none,
  39     int_or_none,
  40     parse_iso8601,
  41     RegexNotFoundError,
  42     sanitize_filename,
  43     sanitized_Request,
  44     unescapeHTML,
  45     unified_strdate,
  46     url_basename,
  47     xpath_text,
  48     xpath_with_ns,
  49     determine_protocol,
  50     parse_duration,
  51     mimetype2ext,
  52     update_url_query,
  53 )
  54
  55
  56 class InfoExtractor(object):
  57     """Information Extractor class.
  58
  59     Information extractors are the classes that, given a URL, extract
  60     information about the video (or videos) the URL refers to. This
  61     information includes the real video URL, the video title, author and
  62     others. The information is stored in a dictionary which is then
  63     passed to the YoutubeDL. The YoutubeDL processes this
  64     information possibly downloading the video to the file system, among
  65     other possible outcomes.
  66
  67     The type field determines the type of the result.
  68     By far the most common value (and the default if _type is missing) is
  69     "video", which indicates a single video.
  70
  71     For a video, the dictionaries must include the following fields:
  72
  73     id:             Video identifier.
  74     title:          Video title, unescaped.
  75
  76     Additionally, it must contain either a formats entry or a url one:
  77
  78     formats:        A list of dictionaries for each format available, ordered
  79                     from worst to best quality.
  80
  81                     Potential fields:
  82                     * url        Mandatory. The URL of the video file
  83                     * ext        Will be calculated from URL if missing
  84                     * format     A human-readable description of the format
  85                                  ("mp4 container with h264/opus").
  86                                  Calculated from the format_id, width, height
  87                                  and format_note fields if missing.
  88                     * format_id  A short description of the format
  89                                  ("mp4_h264_opus" or "19").
  90                                 Technically optional, but strongly recommended.
  91                     * format_note Additional info about the format
  92                                  ("3D" or "DASH video")
  93                     * width      Width of the video, if known
  94                     * height     Height of the video, if known
  95                     * resolution Textual description of width and height
  96                     * tbr        Average bitrate of audio and video in KBit/s
  97                     * abr        Average audio bitrate in KBit/s
  98                     * acodec     Name of the audio codec in use
  99                     * asr        Audio sampling rate in Hertz
 100                     * vbr        Average video bitrate in KBit/s
 101                     * fps        Frame rate
 102                     * vcodec     Name of the video codec in use
 103                     * container  Name of the container format
 104                     * filesize   The number of bytes, if known in advance
 105                     * filesize_approx  An estimate for the number of bytes
 106                     * player_url SWF Player URL (used for rtmpdump).
 107                     * protocol   The protocol that will be used for the actual
 108                                  download, lower-case.
 109                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 110                                  "m3u8", "m3u8_native" or "http_dash_segments".
 111                     * preference Order number of this format. If this field is
 112                                  present and not None, the formats get sorted
 113                                  by this field, regardless of all other values.
 114                                  -1 for default (order by other properties),
 115                                  -2 or smaller for less than default.
 116                                  < -1000 to hide the format (if there is
 117                                     another one which is strictly better)
 118                     * language   Language code, e.g. "de" or "en-US".
 119                     * language_preference  Is this in the language mentioned in
 120                                  the URL?
 121                                  10 if it's what the URL is about,
 122                                  -1 for default (don't know),
 123                                  -10 otherwise, other values reserved for now.
 124                     * quality    Order number of the video quality of this
 125                                  format, irrespective of the file format.
 126                                  -1 for default (order by other properties),
 127                                  -2 or smaller for less than default.
 128                     * source_preference  Order number for this video source
 129                                   (quality takes higher priority)
 130                                  -1 for default (order by other properties),
 131                                  -2 or smaller for less than default.
 132                     * http_headers  A dictionary of additional HTTP headers
 133                                  to add to the request.
 134                     * stretched_ratio  If given and not 1, indicates that the
 135                                  video's pixels are not square.
 136                                  width : height ratio as float.
 137                     * no_resume  The server does not support resuming the
 138                                  (HTTP or RTMP) download. Boolean.
 139
 140     url:            Final video URL.
 141     ext:            Video filename extension.
 142     format:         The video format, defaults to ext (used for --get-format)
 143     player_url:     SWF Player URL (used for rtmpdump).
 144
 145     The following fields are optional:
 146
 147     alt_title:      A secondary title of the video.
 148     display_id      An alternative identifier for the video, not necessarily
 149                     unique, but available before title. Typically, id is
 150                     something like "4234987", title "Dancing naked mole rats",
 151                     and display_id "dancing-naked-mole-rats"
 152     thumbnails:     A list of dictionaries, with the following entries:
 153                         * "id" (optional, string) - Thumbnail format ID
 154                         * "url"
 155                         * "preference" (optional, int) - quality of the image
 156                         * "width" (optional, int)
 157                         * "height" (optional, int)
 158                         * "resolution" (optional, string "{width}x{height}",
 159                                         deprecated)
 160     thumbnail:      Full URL to a video thumbnail image.
 161     description:    Full video description.
 162     uploader:       Full name of the video uploader.
 163     license:        License name the video is licensed under.
 164     creator:        The main artist who created the video.
 165     release_date:   The date (YYYYMMDD) when the video was released.
 166     timestamp:      UNIX timestamp of the moment the video became available.
 167     upload_date:    Video upload date (YYYYMMDD).
 168                     If not explicitly set, calculated from timestamp.
 169     uploader_id:    Nickname or id of the video uploader.
 170     uploader_url:   Full URL to a personal webpage of the video uploader.
 171     location:       Physical location where the video was filmed.
 172     subtitles:      The available subtitles as a dictionary in the format
 173                     {language: subformats}. "subformats" is a list sorted from
 174                     lower to higher preference, each element is a dictionary
 175                     with the "ext" entry and one of:
 176                         * "data": The subtitles file contents
 177                         * "url": A URL pointing to the subtitles file
 178                     "ext" will be calculated from URL if missing
 179     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 180                     automatically generated captions
 181     duration:       Length of the video in seconds, as an integer or float.
 182     view_count:     How many users have watched the video on the platform.
 183     like_count:     Number of positive ratings of the video
 184     dislike_count:  Number of negative ratings of the video
 185     repost_count:   Number of reposts of the video
 186     average_rating: Average rating given by users, the scale used depends on the webpage
 187     comment_count:  Number of comments on the video
 188     comments:       A list of comments, each with one or more of the following
 189                     properties (all but one of text or html optional):
 190                         * "author" - human-readable name of the comment author
 191                         * "author_id" - user ID of the comment author
 192                         * "id" - Comment ID
 193                         * "html" - Comment as HTML
 194                         * "text" - Plain text of the comment
 195                         * "timestamp" - UNIX timestamp of comment
 196                         * "parent" - ID of the comment this one is replying to.
 197                                      Set to "root" to indicate that this is a
 198                                      comment to the original video.
 199     age_limit:      Age restriction for the video, as an integer (years)
 200     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 201                     should allow to get the same result again. (It will be set
 202                     by YoutubeDL if it's missing)
 203     categories:     A list of categories that the video falls in, for example
 204                     ["Sports", "Berlin"]
 205     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 206     is_live:        True, False, or None (=unknown). Whether this video is a
 207                     live stream that goes on instead of a fixed-length video.
 208     start_time:     Time in seconds where the reproduction should start, as
 209                     specified in the URL.
 210     end_time:       Time in seconds where the reproduction should end, as
 211                     specified in the URL.
 212
 213     The following fields should only be used when the video belongs to some logical
 214     chapter or section:
 215
 216     chapter:        Name or title of the chapter the video belongs to.
 217     chapter_number: Number of the chapter the video belongs to, as an integer.
 218     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 219
 220     The following fields should only be used when the video is an episode of some
 221     series or programme:
 222
 223     series:         Title of the series or programme the video episode belongs to.
 224     season:         Title of the season the video episode belongs to.
 225     season_number:  Number of the season the video episode belongs to, as an integer.
 226     season_id:      Id of the season the video episode belongs to, as a unicode string.
 227     episode:        Title of the video episode. Unlike mandatory video title field,
 228                     this field should denote the exact title of the video episode
 229                     without any kind of decoration.
 230     episode_number: Number of the video episode within a season, as an integer.
 231     episode_id:     Id of the video episode, as a unicode string.
 232
 233     Unless mentioned otherwise, the fields should be Unicode strings.
 234
 235     Unless mentioned otherwise, None is equivalent to absence of information.
 236
 237
 238     _type "playlist" indicates multiple videos.
 239     There must be a key "entries", which is a list, an iterable, or a PagedList
 240     object, each element of which is a valid dictionary by this specification.
 241
 242     Additionally, playlists can have "title", "description" and "id" attributes
 243     with the same semantics as videos (see above).
 244
 245
 246     _type "multi_video" indicates that there are multiple videos that
 247     form a single show, for example multiple acts of an opera or TV episode.
 248     It must have an entries key like a playlist and contain all the keys
 249     required for a video at the same time.
 250
 251
 252     _type "url" indicates that the video must be extracted from another
 253     location, possibly by a different extractor. Its only required key is:
 254     "url" - the next URL to extract.
 255     The key "ie_key" can be set to the class name (minus the trailing "IE",
 256     e.g. "Youtube") if the extractor class is known in advance.
 257     Additionally, the dictionary may have any properties of the resolved entity
 258     known in advance, for example "title" if the title of the referred video is
 259     known ahead of time.
 260
 261
 262     _type "url_transparent" entities have the same specification as "url", but
 263     indicate that the given additional information is more precise than the one
 264     associated with the resolved URL.
 265     This is useful when a site employs a video service that hosts the video and
 266     its technical metadata, but that video service does not embed a useful
 267     title, description etc.
 268
 269
 270     Subclasses of this one should re-define the _real_initialize() and
 271     _real_extract() methods and define a _VALID_URL regexp.
 272     Probably, they should also be added to the list of extractors.
 273
 274     Finally, the _WORKING attribute should be set to False for broken IEs
 275     in order to warn the users and skip the tests.
 276     """
 277
 278     _ready = False
 279     _downloader = None
 280     _WORKING = True
 281
 282     def __init__(self, downloader=None):
 283         """Constructor. Receives an optional downloader."""
 284         self._ready = False
 285         self.set_downloader(downloader)
 286
 287     @classmethod
 288     def suitable(cls, url):
 289         """Receives a URL and returns True if suitable for this IE."""
 290
 291         # This does not use has/getattr intentionally - we want to know whether
 292         # we have cached the regexp for *this* class, whereas getattr would also
 293         # match the superclass
 294         if '_VALID_URL_RE' not in cls.__dict__:
 295             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 296         return cls._VALID_URL_RE.match(url) is not None
 297
 298     @classmethod
 299     def _match_id(cls, url):
 300         if '_VALID_URL_RE' not in cls.__dict__:
 301             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 302         m = cls._VALID_URL_RE.match(url)
 303         assert m
 304         return m.group('id')
 305
 306     @classmethod
 307     def working(cls):
 308         """Getter method for _WORKING."""
 309         return cls._WORKING
 310
 311     def initialize(self):
 312         """Initializes an instance (authentication, etc)."""
 313         if not self._ready:
 314             self._real_initialize()
 315             self._ready = True
 316
 317     def extract(self, url):
 318         """Extracts URL information and returns it in list of dicts."""
 319         try:
 320             self.initialize()
 321             return self._real_extract(url)
 322         except ExtractorError:
 323             raise
 324         except compat_http_client.IncompleteRead as e:
 325             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 326         except (KeyError, StopIteration) as e:
 327             raise ExtractorError('An extractor error has occurred.', cause=e)
 328
 329     def set_downloader(self, downloader):
 330         """Sets the downloader for this IE."""
 331         self._downloader = downloader
 332
 333     def _real_initialize(self):
 334         """Real initialization process. Redefine in subclasses."""
 335         pass
 336
 337     def _real_extract(self, url):
 338         """Real extraction process. Redefine in subclasses."""
 339         pass
 340
 341     @classmethod
 342     def ie_key(cls):
 343         """A string for getting the InfoExtractor with get_info_extractor"""
 344         return compat_str(cls.__name__[:-2])
 345
 346     @property
 347     def IE_NAME(self):
 348         return compat_str(type(self).__name__[:-2])
 349
 350     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
 351         """ Returns the response handle """
 352         if note is None:
 353             self.report_download_webpage(video_id)
 354         elif note is not False:
 355             if video_id is None:
 356                 self.to_screen('%s' % (note,))
 357             else:
 358                 self.to_screen('%s: %s' % (video_id, note))
 359         # data, headers and query params will be ignored for `Request` objects
 360         if isinstance(url_or_request, compat_str):
 361             if query:
 362                 url_or_request = update_url_query(url_or_request, query)
 363             if data or headers:
 364                 url_or_request = sanitized_Request(url_or_request, data, headers or {})
 365         try:
 366             return self._downloader.urlopen(url_or_request)
 367         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 368             if errnote is False:
 369                 return False
 370             if errnote is None:
 371                 errnote = 'Unable to download webpage'
 372
 373             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 374             if fatal:
 375                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 376             else:
 377                 self._downloader.report_warning(errmsg)
 378                 return False
 379
 380     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
 381         """ Returns a tuple (page content as string, URL handle) """
 382         # Strip hashes from the URL (#1038)
 383         if isinstance(url_or_request, (compat_str, str)):
 384             url_or_request = url_or_request.partition('#')[0]
 385
 386         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 387         if urlh is False:
 388             assert not fatal
 389             return False
 390         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 391         return (content, urlh)
 392
 393     @staticmethod
 394     def _guess_encoding_from_content(content_type, webpage_bytes):
 395         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 396         if m:
 397             encoding = m.group(1)
 398         else:
 399             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 400                           webpage_bytes[:1024])
 401             if m:
 402                 encoding = m.group(1).decode('ascii')
 403             elif webpage_bytes.startswith(b'\xff\xfe'):
 404                 encoding = 'utf-16'
 405             else:
 406                 encoding = 'utf-8'
 407
 408         return encoding
 409
 410     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 411         content_type = urlh.headers.get('Content-Type', '')
 412         webpage_bytes = urlh.read()
 413         if prefix is not None:
 414             webpage_bytes = prefix + webpage_bytes
 415         if not encoding:
 416             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 417         if self._downloader.params.get('dump_intermediate_pages', False):
 418             try:
 419                 url = url_or_request.get_full_url()
 420             except AttributeError:
 421                 url = url_or_request
 422             self.to_screen('Dumping request to ' + url)
 423             dump = base64.b64encode(webpage_bytes).decode('ascii')
 424             self._downloader.to_screen(dump)
 425         if self._downloader.params.get('write_pages', False):
 426             try:
 427                 url = url_or_request.get_full_url()
 428             except AttributeError:
 429                 url = url_or_request
 430             basen = '%s_%s' % (video_id, url)
 431             if len(basen) > 240:
 432                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 433                 basen = basen[:240 - len(h)] + h
 434             raw_filename = basen + '.dump'
 435             filename = sanitize_filename(raw_filename, restricted=True)
 436             self.to_screen('Saving request to ' + filename)
 437             # Working around MAX_PATH limitation on Windows (see
 438             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 439             if compat_os_name == 'nt':
 440                 absfilepath = os.path.abspath(filename)
 441                 if len(absfilepath) > 259:
 442                     filename = '\\\\?\\' + absfilepath
 443             with open(filename, 'wb') as outf:
 444                 outf.write(webpage_bytes)
 445
 446         try:
 447             content = webpage_bytes.decode(encoding, 'replace')
 448         except LookupError:
 449             content = webpage_bytes.decode('utf-8', 'replace')
 450
 451         if ('<title>Access to this site is blocked</title>' in content and
 452                 'Websense' in content[:512]):
 453             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 454             blocked_iframe = self._html_search_regex(
 455                 r'<iframe src="([^"]+)"', content,
 456                 'Websense information URL', default=None)
 457             if blocked_iframe:
 458                 msg += ' Visit %s for more details' % blocked_iframe
 459             raise ExtractorError(msg, expected=True)
 460         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 461             msg = (
 462                 'Access to this webpage has been blocked by Indian censorship. '
 463                 'Use a VPN or proxy server (with --proxy) to route around it.')
 464             block_msg = self._html_search_regex(
 465                 r'</h1><p>(.*?)</p>',
 466                 content, 'block message', default=None)
 467             if block_msg:
 468                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 469             raise ExtractorError(msg, expected=True)
 470
 471         return content
 472
 473     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
 474         """ Returns the data of the page as a string """
 475         success = False
 476         try_count = 0
 477         while success is False:
 478             try:
 479                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 480                 success = True
 481             except compat_http_client.IncompleteRead as e:
 482                 try_count += 1
 483                 if try_count >= tries:
 484                     raise e
 485                 self._sleep(timeout, video_id)
 486         if res is False:
 487             return res
 488         else:
 489             content, _ = res
 490             return content
 491
 492     def _download_xml(self, url_or_request, video_id,
 493                       note='Downloading XML', errnote='Unable to download XML',
 494                       transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
 495         """Return the xml as an xml.etree.ElementTree.Element"""
 496         xml_string = self._download_webpage(
 497             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 498         if xml_string is False:
 499             return xml_string
 500         if transform_source:
 501             xml_string = transform_source(xml_string)
 502         return compat_etree_fromstring(xml_string.encode('utf-8'))
 503
 504     def _download_json(self, url_or_request, video_id,
 505                        note='Downloading JSON metadata',
 506                        errnote='Unable to download JSON metadata',
 507                        transform_source=None,
 508                        fatal=True, encoding=None, data=None, headers=None, query=None):
 509         json_string = self._download_webpage(
 510             url_or_request, video_id, note, errnote, fatal=fatal,
 511             encoding=encoding, data=data, headers=headers, query=query)
 512         if (not fatal) and json_string is False:
 513             return None
 514         return self._parse_json(
 515             json_string, video_id, transform_source=transform_source, fatal=fatal)
 516
 517     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 518         if transform_source:
 519             json_string = transform_source(json_string)
 520         try:
 521             return json.loads(json_string)
 522         except ValueError as ve:
 523             errmsg = '%s: Failed to parse JSON ' % video_id
 524             if fatal:
 525                 raise ExtractorError(errmsg, cause=ve)
 526             else:
 527                 self.report_warning(errmsg + str(ve))
 528
 529     def report_warning(self, msg, video_id=None):
 530         idstr = '' if video_id is None else '%s: ' % video_id
 531         self._downloader.report_warning(
 532             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 533
 534     def to_screen(self, msg):
 535         """Print msg to screen, prefixing it with '[ie_name]'"""
 536         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 537
 538     def report_extraction(self, id_or_name):
 539         """Report information extraction."""
 540         self.to_screen('%s: Extracting information' % id_or_name)
 541
 542     def report_download_webpage(self, video_id):
 543         """Report webpage download."""
 544         self.to_screen('%s: Downloading webpage' % video_id)
 545
 546     def report_age_confirmation(self):
 547         """Report attempt to confirm age."""
 548         self.to_screen('Confirming age')
 549
 550     def report_login(self):
 551         """Report attempt to log in."""
 552         self.to_screen('Logging in')
 553
 554     @staticmethod
 555     def raise_login_required(msg='This video is only available for registered users'):
 556         raise ExtractorError(
 557             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 558             expected=True)
 559
 560     @staticmethod
 561     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 562         raise ExtractorError(
 563             '%s. You might want to use --proxy to workaround.' % msg,
 564             expected=True)
 565
 566     # Methods for following #608
 567     @staticmethod
 568     def url_result(url, ie=None, video_id=None, video_title=None):
 569         """Returns a URL that points to a page that should be processed"""
 570         # TODO: ie should be the class used for getting the info
 571         video_info = {'_type': 'url',
 572                       'url': url,
 573                       'ie_key': ie}
 574         if video_id is not None:
 575             video_info['id'] = video_id
 576         if video_title is not None:
 577             video_info['title'] = video_title
 578         return video_info
 579
 580     @staticmethod
 581     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 582         """Returns a playlist"""
 583         video_info = {'_type': 'playlist',
 584                       'entries': entries}
 585         if playlist_id:
 586             video_info['id'] = playlist_id
 587         if playlist_title:
 588             video_info['title'] = playlist_title
 589         if playlist_description:
 590             video_info['description'] = playlist_description
 591         return video_info
 592
 593     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 594         """
 595         Perform a regex search on the given string, using a single or a list of
 596         patterns returning the first matching group.
 597         In case of failure return a default value or raise a WARNING or a
 598         RegexNotFoundError, depending on fatal, specifying the field name.
 599         """
 600         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 601             mobj = re.search(pattern, string, flags)
 602         else:
 603             for p in pattern:
 604                 mobj = re.search(p, string, flags)
 605                 if mobj:
 606                     break
 607
 608         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 609             _name = '\033[0;34m%s\033[0m' % name
 610         else:
 611             _name = name
 612
 613         if mobj:
 614             if group is None:
 615                 # return the first matching group
 616                 return next(g for g in mobj.groups() if g is not None)
 617             else:
 618                 return mobj.group(group)
 619         elif default is not NO_DEFAULT:
 620             return default
 621         elif fatal:
 622             raise RegexNotFoundError('Unable to extract %s' % _name)
 623         else:
 624             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 625             return None
 626
 627     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 628         """
 629         Like _search_regex, but strips HTML tags and unescapes entities.
 630         """
 631         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 632         if res:
 633             return clean_html(res).strip()
 634         else:
 635             return res
 636
 637     def _get_login_info(self):
 638         """
 639         Get the login info as (username, password)
 640         It will look in the netrc file using the _NETRC_MACHINE value
 641         If there's no info available, return (None, None)
 642         """
 643         if self._downloader is None:
 644             return (None, None)
 645
 646         username = None
 647         password = None
 648         downloader_params = self._downloader.params
 649
 650         # Attempt to use provided username and password or .netrc data
 651         if downloader_params.get('username') is not None:
 652             username = downloader_params['username']
 653             password = downloader_params['password']
 654         elif downloader_params.get('usenetrc', False):
 655             try:
 656                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 657                 if info is not None:
 658                     username = info[0]
 659                     password = info[2]
 660                 else:
 661                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 662             except (IOError, netrc.NetrcParseError) as err:
 663                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 664
 665         return (username, password)
 666
 667     def _get_tfa_info(self, note='two-factor verification code'):
 668         """
 669         Get the two-factor authentication info
 670         TODO - asking the user will be required for sms/phone verify
 671         currently just uses the command line option
 672         If there's no info available, return None
 673         """
 674         if self._downloader is None:
 675             return None
 676         downloader_params = self._downloader.params
 677
 678         if downloader_params.get('twofactor') is not None:
 679             return downloader_params['twofactor']
 680
 681         return compat_getpass('Type %s and press [Return]: ' % note)
 682
 683     # Helper functions for extracting OpenGraph info
 684     @staticmethod
 685     def _og_regexes(prop):
 686         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 687         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 688                        % {'prop': re.escape(prop)})
 689         template = r'<meta[^>]+?%s[^>]+?%s'
 690         return [
 691             template % (property_re, content_re),
 692             template % (content_re, property_re),
 693         ]
 694
 695     @staticmethod
 696     def _meta_regex(prop):
 697         return r'''(?isx)<meta
 698                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 699                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 700
 701     def _og_search_property(self, prop, html, name=None, **kargs):
 702         if name is None:
 703             name = 'OpenGraph %s' % prop
 704         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 705         if escaped is None:
 706             return None
 707         return unescapeHTML(escaped)
 708
 709     def _og_search_thumbnail(self, html, **kargs):
 710         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 711
 712     def _og_search_description(self, html, **kargs):
 713         return self._og_search_property('description', html, fatal=False, **kargs)
 714
 715     def _og_search_title(self, html, **kargs):
 716         return self._og_search_property('title', html, **kargs)
 717
 718     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 719         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 720         if secure:
 721             regexes = self._og_regexes('video:secure_url') + regexes
 722         return self._html_search_regex(regexes, html, name, **kargs)
 723
 724     def _og_search_url(self, html, **kargs):
 725         return self._og_search_property('url', html, **kargs)
 726
 727     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 728         if display_name is None:
 729             display_name = name
 730         return self._html_search_regex(
 731             self._meta_regex(name),
 732             html, display_name, fatal=fatal, group='content', **kwargs)
 733
 734     def _dc_search_uploader(self, html):
 735         return self._html_search_meta('dc.creator', html, 'uploader')
 736
 737     def _rta_search(self, html):
 738         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 739         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 740                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 741                      html):
 742             return 18
 743         return 0
 744
 745     def _media_rating_search(self, html):
 746         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 747         rating = self._html_search_meta('rating', html)
 748
 749         if not rating:
 750             return None
 751
 752         RATING_TABLE = {
 753             'safe for kids': 0,
 754             'general': 8,
 755             '14 years': 14,
 756             'mature': 17,
 757             'restricted': 19,
 758         }
 759         return RATING_TABLE.get(rating.lower())
 760
 761     def _family_friendly_search(self, html):
 762         # See http://schema.org/VideoObject
 763         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 764
 765         if not family_friendly:
 766             return None
 767
 768         RATING_TABLE = {
 769             '1': 0,
 770             'true': 0,
 771             '0': 18,
 772             'false': 18,
 773         }
 774         return RATING_TABLE.get(family_friendly.lower())
 775
 776     def _twitter_search_player(self, html):
 777         return self._html_search_meta('twitter:player', html,
 778                                       'twitter card player')
 779
 780     def _search_json_ld(self, html, video_id, **kwargs):
 781         json_ld = self._search_regex(
 782             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 783             html, 'JSON-LD', group='json_ld', **kwargs)
 784         if not json_ld:
 785             return {}
 786         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 787
 788     def _json_ld(self, json_ld, video_id, fatal=True):
 789         if isinstance(json_ld, compat_str):
 790             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 791         if not json_ld:
 792             return {}
 793         info = {}
 794         if json_ld.get('@context') == 'http://schema.org':
 795             item_type = json_ld.get('@type')
 796             if item_type == 'TVEpisode':
 797                 info.update({
 798                     'episode': unescapeHTML(json_ld.get('name')),
 799                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 800                     'description': unescapeHTML(json_ld.get('description')),
 801                 })
 802                 part_of_season = json_ld.get('partOfSeason')
 803                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 804                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 805                 part_of_series = json_ld.get('partOfSeries')
 806                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 807                     info['series'] = unescapeHTML(part_of_series.get('name'))
 808             elif item_type == 'Article':
 809                 info.update({
 810                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 811                     'title': unescapeHTML(json_ld.get('headline')),
 812                     'description': unescapeHTML(json_ld.get('articleBody')),
 813                 })
 814         return dict((k, v) for k, v in info.items() if v is not None)
 815
 816     @staticmethod
 817     def _hidden_inputs(html):
 818         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 819         hidden_inputs = {}
 820         for input in re.findall(r'(?i)<input([^>]+)>', html):
 821             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 822                 continue
 823             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 824             if not name:
 825                 continue
 826             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 827             if not value:
 828                 continue
 829             hidden_inputs[name.group('value')] = value.group('value')
 830         return hidden_inputs
 831
 832     def _form_hidden_inputs(self, form_id, html):
 833         form = self._search_regex(
 834             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 835             html, '%s form' % form_id, group='form')
 836         return self._hidden_inputs(form)
 837
 838     def _sort_formats(self, formats, field_preference=None):
 839         if not formats:
 840             raise ExtractorError('No video formats found')
 841
 842         for f in formats:
 843             # Automatically determine tbr when missing based on abr and vbr (improves
 844             # formats sorting in some cases)
 845             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 846                 f['tbr'] = f['abr'] + f['vbr']
 847
 848         def _formats_key(f):
 849             # TODO remove the following workaround
 850             from ..utils import determine_ext
 851             if not f.get('ext') and 'url' in f:
 852                 f['ext'] = determine_ext(f['url'])
 853
 854             if isinstance(field_preference, (list, tuple)):
 855                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 856
 857             preference = f.get('preference')
 858             if preference is None:
 859                 preference = 0
 860                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 861                     preference -= 0.5
 862
 863             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 864
 865             if f.get('vcodec') == 'none':  # audio only
 866                 preference -= 50
 867                 if self._downloader.params.get('prefer_free_formats'):
 868                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 869                 else:
 870                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 871                 ext_preference = 0
 872                 try:
 873                     audio_ext_preference = ORDER.index(f['ext'])
 874                 except ValueError:
 875                     audio_ext_preference = -1
 876             else:
 877                 if f.get('acodec') == 'none':  # video only
 878                     preference -= 40
 879                 if self._downloader.params.get('prefer_free_formats'):
 880                     ORDER = ['flv', 'mp4', 'webm']
 881                 else:
 882                     ORDER = ['webm', 'flv', 'mp4']
 883                 try:
 884                     ext_preference = ORDER.index(f['ext'])
 885                 except ValueError:
 886                     ext_preference = -1
 887                 audio_ext_preference = 0
 888
 889             return (
 890                 preference,
 891                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 892                 f.get('quality') if f.get('quality') is not None else -1,
 893                 f.get('tbr') if f.get('tbr') is not None else -1,
 894                 f.get('filesize') if f.get('filesize') is not None else -1,
 895                 f.get('vbr') if f.get('vbr') is not None else -1,
 896                 f.get('height') if f.get('height') is not None else -1,
 897                 f.get('width') if f.get('width') is not None else -1,
 898                 proto_preference,
 899                 ext_preference,
 900                 f.get('abr') if f.get('abr') is not None else -1,
 901                 audio_ext_preference,
 902                 f.get('fps') if f.get('fps') is not None else -1,
 903                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 904                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 905                 f.get('format_id') if f.get('format_id') is not None else '',
 906             )
 907         formats.sort(key=_formats_key)
 908
 909     def _check_formats(self, formats, video_id):
 910         if formats:
 911             formats[:] = filter(
 912                 lambda f: self._is_valid_url(
 913                     f['url'], video_id,
 914                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 915                 formats)
 916
 917     @staticmethod
 918     def _remove_duplicate_formats(formats):
 919         format_urls = set()
 920         unique_formats = []
 921         for f in formats:
 922             if f['url'] not in format_urls:
 923                 format_urls.add(f['url'])
 924                 unique_formats.append(f)
 925         formats[:] = unique_formats
 926
 927     def _is_valid_url(self, url, video_id, item='video'):
 928         url = self._proto_relative_url(url, scheme='http:')
 929         # For now assume non HTTP(S) URLs always valid
 930         if not (url.startswith('http://') or url.startswith('https://')):
 931             return True
 932         try:
 933             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 934             return True
 935         except ExtractorError as e:
 936             if isinstance(e.cause, compat_urllib_error.URLError):
 937                 self.to_screen(
 938                     '%s: %s URL is invalid, skipping' % (video_id, item))
 939                 return False
 940             raise
 941
 942     def http_scheme(self):
 943         """ Either "http:" or "https:", depending on the user's preferences """
 944         return (
 945             'http:'
 946             if self._downloader.params.get('prefer_insecure', False)
 947             else 'https:')
 948
 949     def _proto_relative_url(self, url, scheme=None):
 950         if url is None:
 951             return url
 952         if url.startswith('//'):
 953             if scheme is None:
 954                 scheme = self.http_scheme()
 955             return scheme + url
 956         else:
 957             return url
 958
 959     def _sleep(self, timeout, video_id, msg_template=None):
 960         if msg_template is None:
 961             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 962         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 963         self.to_screen(msg)
 964         time.sleep(timeout)
 965
 966     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 967                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 968                              fatal=True):
 969         manifest = self._download_xml(
 970             manifest_url, video_id, 'Downloading f4m manifest',
 971             'Unable to download f4m manifest',
 972             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 973             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 974             transform_source=transform_source,
 975             fatal=fatal)
 976
 977         if manifest is False:
 978             return []
 979
 980         return self._parse_f4m_formats(
 981             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
 982             transform_source=transform_source, fatal=fatal)
 983
 984     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
 985                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
 986                            fatal=True):
 987         formats = []
 988         manifest_version = '1.0'
 989         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 990         if not media_nodes:
 991             manifest_version = '2.0'
 992             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 993         # Remove unsupported DRM protected media from final formats
 994         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
 995         media_nodes = remove_encrypted_media(media_nodes)
 996         if not media_nodes:
 997             return formats
 998         base_url = xpath_text(
 999             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1000             'base URL', default=None)
1001         if base_url:
1002             base_url = base_url.strip()
1003         for i, media_el in enumerate(media_nodes):
1004             if manifest_version == '2.0':
1005                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
1006                 if not media_url:
1007                     continue
1008                 manifest_url = (
1009                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1010                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1011                 # If media_url is itself a f4m manifest do the recursive extraction
1012                 # since bitrates in parent manifest (this one) and media_url manifest
1013                 # may differ leading to inability to resolve the format by requested
1014                 # bitrate in f4m downloader
1015                 if determine_ext(manifest_url) == 'f4m':
1016                     formats.extend(self._extract_f4m_formats(
1017                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1018                         transform_source=transform_source, fatal=fatal))
1019                     continue
1020             tbr = int_or_none(media_el.attrib.get('bitrate'))
1021             formats.append({
1022                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1023                 'url': manifest_url,
1024                 'ext': 'flv',
1025                 'tbr': tbr,
1026                 'width': int_or_none(media_el.attrib.get('width')),
1027                 'height': int_or_none(media_el.attrib.get('height')),
1028                 'preference': preference,
1029             })
1030         return formats
1031
1032     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1033                               entry_protocol='m3u8', preference=None,
1034                               m3u8_id=None, note=None, errnote=None,
1035                               fatal=True):
1036
1037         formats = [{
1038             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1039             'url': m3u8_url,
1040             'ext': ext,
1041             'protocol': 'm3u8',
1042             'preference': preference - 1 if preference else -1,
1043             'resolution': 'multiple',
1044             'format_note': 'Quality selection URL',
1045         }]
1046
1047         format_url = lambda u: (
1048             u
1049             if re.match(r'^https?://', u)
1050             else compat_urlparse.urljoin(m3u8_url, u))
1051
1052         res = self._download_webpage_handle(
1053             m3u8_url, video_id,
1054             note=note or 'Downloading m3u8 information',
1055             errnote=errnote or 'Failed to download m3u8 information',
1056             fatal=fatal)
1057         if res is False:
1058             return []
1059         m3u8_doc, urlh = res
1060         m3u8_url = urlh.geturl()
1061
1062         # We should try extracting formats only from master playlists [1], i.e.
1063         # playlists that describe available qualities. On the other hand media
1064         # playlists [2] should be returned as is since they contain just the media
1065         # without qualities renditions.
1066         # Fortunately, master playlist can be easily distinguished from media
1067         # playlist based on particular tags availability. As of [1, 2] master
1068         # playlist tags MUST NOT appear in a media playist and vice versa.
1069         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1070         # and MUST NOT appear in master playlist thus we can clearly detect media
1071         # playlist with this criterion.
1072         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1073         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1074         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1075         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1076             return [{
1077                 'url': m3u8_url,
1078                 'format_id': m3u8_id,
1079                 'ext': ext,
1080                 'protocol': entry_protocol,
1081                 'preference': preference,
1082             }]
1083         last_info = None
1084         last_media = None
1085         kv_rex = re.compile(
1086             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1087         for line in m3u8_doc.splitlines():
1088             if line.startswith('#EXT-X-STREAM-INF:'):
1089                 last_info = {}
1090                 for m in kv_rex.finditer(line):
1091                     v = m.group('val')
1092                     if v.startswith('"'):
1093                         v = v[1:-1]
1094                     last_info[m.group('key')] = v
1095             elif line.startswith('#EXT-X-MEDIA:'):
1096                 last_media = {}
1097                 for m in kv_rex.finditer(line):
1098                     v = m.group('val')
1099                     if v.startswith('"'):
1100                         v = v[1:-1]
1101                     last_media[m.group('key')] = v
1102             elif line.startswith('#') or not line.strip():
1103                 continue
1104             else:
1105                 if last_info is None:
1106                     formats.append({'url': format_url(line)})
1107                     continue
1108                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1109                 format_id = []
1110                 if m3u8_id:
1111                     format_id.append(m3u8_id)
1112                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1113                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1114                 f = {
1115                     'format_id': '-'.join(format_id),
1116                     'url': format_url(line.strip()),
1117                     'tbr': tbr,
1118                     'ext': ext,
1119                     'protocol': entry_protocol,
1120                     'preference': preference,
1121                 }
1122                 resolution = last_info.get('RESOLUTION')
1123                 if resolution:
1124                     width_str, height_str = resolution.split('x')
1125                     f['width'] = int(width_str)
1126                     f['height'] = int(height_str)
1127                 codecs = last_info.get('CODECS')
1128                 if codecs:
1129                     vcodec, acodec = [None] * 2
1130                     va_codecs = codecs.split(',')
1131                     if len(va_codecs) == 1:
1132                         # Audio only entries usually come with single codec and
1133                         # no resolution. For more robustness we also check it to
1134                         # be mp4 audio.
1135                         if not resolution and va_codecs[0].startswith('mp4a'):
1136                             vcodec, acodec = 'none', va_codecs[0]
1137                         else:
1138                             vcodec = va_codecs[0]
1139                     else:
1140                         vcodec, acodec = va_codecs[:2]
1141                     f.update({
1142                         'acodec': acodec,
1143                         'vcodec': vcodec,
1144                     })
1145                 if last_media is not None:
1146                     f['m3u8_media'] = last_media
1147                     last_media = None
1148                 formats.append(f)
1149                 last_info = {}
1150         return formats
1151
1152     @staticmethod
1153     def _xpath_ns(path, namespace=None):
1154         if not namespace:
1155             return path
1156         out = []
1157         for c in path.split('/'):
1158             if not c or c == '.':
1159                 out.append(c)
1160             else:
1161                 out.append('{%s}%s' % (namespace, c))
1162         return '/'.join(out)
1163
1164     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1165         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1166
1167         if smil is False:
1168             assert not fatal
1169             return []
1170
1171         namespace = self._parse_smil_namespace(smil)
1172
1173         return self._parse_smil_formats(
1174             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1175
1176     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1177         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1178         if smil is False:
1179             return {}
1180         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1181
1182     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1183         return self._download_xml(
1184             smil_url, video_id, 'Downloading SMIL file',
1185             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1186
1187     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1188         namespace = self._parse_smil_namespace(smil)
1189
1190         formats = self._parse_smil_formats(
1191             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1192         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1193
1194         video_id = os.path.splitext(url_basename(smil_url))[0]
1195         title = None
1196         description = None
1197         upload_date = None
1198         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1199             name = meta.attrib.get('name')
1200             content = meta.attrib.get('content')
1201             if not name or not content:
1202                 continue
1203             if not title and name == 'title':
1204                 title = content
1205             elif not description and name in ('description', 'abstract'):
1206                 description = content
1207             elif not upload_date and name == 'date':
1208                 upload_date = unified_strdate(content)
1209
1210         thumbnails = [{
1211             'id': image.get('type'),
1212             'url': image.get('src'),
1213             'width': int_or_none(image.get('width')),
1214             'height': int_or_none(image.get('height')),
1215         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1216
1217         return {
1218             'id': video_id,
1219             'title': title or video_id,
1220             'description': description,
1221             'upload_date': upload_date,
1222             'thumbnails': thumbnails,
1223             'formats': formats,
1224             'subtitles': subtitles,
1225         }
1226
1227     def _parse_smil_namespace(self, smil):
1228         return self._search_regex(
1229             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1230
1231     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1232         base = smil_url
1233         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1234             b = meta.get('base') or meta.get('httpBase')
1235             if b:
1236                 base = b
1237                 break
1238
1239         formats = []
1240         rtmp_count = 0
1241         http_count = 0
1242         m3u8_count = 0
1243
1244         srcs = []
1245         videos = smil.findall(self._xpath_ns('.//video', namespace))
1246         for video in videos:
1247             src = video.get('src')
1248             if not src or src in srcs:
1249                 continue
1250             srcs.append(src)
1251
1252             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1253             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1254             width = int_or_none(video.get('width'))
1255             height = int_or_none(video.get('height'))
1256             proto = video.get('proto')
1257             ext = video.get('ext')
1258             src_ext = determine_ext(src)
1259             streamer = video.get('streamer') or base
1260
1261             if proto == 'rtmp' or streamer.startswith('rtmp'):
1262                 rtmp_count += 1
1263                 formats.append({
1264                     'url': streamer,
1265                     'play_path': src,
1266                     'ext': 'flv',
1267                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1268                     'tbr': bitrate,
1269                     'filesize': filesize,
1270                     'width': width,
1271                     'height': height,
1272                 })
1273                 if transform_rtmp_url:
1274                     streamer, src = transform_rtmp_url(streamer, src)
1275                     formats[-1].update({
1276                         'url': streamer,
1277                         'play_path': src,
1278                     })
1279                 continue
1280
1281             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1282             src_url = src_url.strip()
1283
1284             if proto == 'm3u8' or src_ext == 'm3u8':
1285                 m3u8_formats = self._extract_m3u8_formats(
1286                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1287                 if len(m3u8_formats) == 1:
1288                     m3u8_count += 1
1289                     m3u8_formats[0].update({
1290                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1291                         'tbr': bitrate,
1292                         'width': width,
1293                         'height': height,
1294                     })
1295                 formats.extend(m3u8_formats)
1296                 continue
1297
1298             if src_ext == 'f4m':
1299                 f4m_url = src_url
1300                 if not f4m_params:
1301                     f4m_params = {
1302                         'hdcore': '3.2.0',
1303                         'plugin': 'flowplayer-3.2.0.1',
1304                     }
1305                 f4m_url += '&' if '?' in f4m_url else '?'
1306                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1307                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1308                 continue
1309
1310             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1311                 http_count += 1
1312                 formats.append({
1313                     'url': src_url,
1314                     'ext': ext or src_ext or 'flv',
1315                     'format_id': 'http-%d' % (bitrate or http_count),
1316                     'tbr': bitrate,
1317                     'filesize': filesize,
1318                     'width': width,
1319                     'height': height,
1320                 })
1321                 continue
1322
1323         return formats
1324
1325     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1326         urls = []
1327         subtitles = {}
1328         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1329             src = textstream.get('src')
1330             if not src or src in urls:
1331                 continue
1332             urls.append(src)
1333             ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1334             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1335             subtitles.setdefault(lang, []).append({
1336                 'url': src,
1337                 'ext': ext,
1338             })
1339         return subtitles
1340
1341     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1342         xspf = self._download_xml(
1343             playlist_url, playlist_id, 'Downloading xpsf playlist',
1344             'Unable to download xspf manifest', fatal=fatal)
1345         if xspf is False:
1346             return []
1347         return self._parse_xspf(xspf, playlist_id)
1348
1349     def _parse_xspf(self, playlist, playlist_id):
1350         NS_MAP = {
1351             'xspf': 'http://xspf.org/ns/0/',
1352             's1': 'http://static.streamone.nl/player/ns/0',
1353         }
1354
1355         entries = []
1356         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1357             title = xpath_text(
1358                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1359             description = xpath_text(
1360                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1361             thumbnail = xpath_text(
1362                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1363             duration = float_or_none(
1364                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1365
1366             formats = [{
1367                 'url': location.text,
1368                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1369                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1370                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1371             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1372             self._sort_formats(formats)
1373
1374             entries.append({
1375                 'id': playlist_id,
1376                 'title': title,
1377                 'description': description,
1378                 'thumbnail': thumbnail,
1379                 'duration': duration,
1380                 'formats': formats,
1381             })
1382         return entries
1383
1384     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1385         res = self._download_webpage_handle(
1386             mpd_url, video_id,
1387             note=note or 'Downloading MPD manifest',
1388             errnote=errnote or 'Failed to download MPD manifest',
1389             fatal=fatal)
1390         if res is False:
1391             return []
1392         mpd, urlh = res
1393         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1394
1395         return self._parse_mpd_formats(
1396             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1397
1398     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1399         if mpd_doc.get('type') == 'dynamic':
1400             return []
1401
1402         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1403
1404         def _add_ns(path):
1405             return self._xpath_ns(path, namespace)
1406
1407         def is_drm_protected(element):
1408             return element.find(_add_ns('ContentProtection')) is not None
1409
1410         def extract_multisegment_info(element, ms_parent_info):
1411             ms_info = ms_parent_info.copy()
1412             segment_list = element.find(_add_ns('SegmentList'))
1413             if segment_list is not None:
1414                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1415                 if segment_urls_e:
1416                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1417                 initialization = segment_list.find(_add_ns('Initialization'))
1418                 if initialization is not None:
1419                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1420             else:
1421                 segment_template = element.find(_add_ns('SegmentTemplate'))
1422                 if segment_template is not None:
1423                     start_number = segment_template.get('startNumber')
1424                     if start_number:
1425                         ms_info['start_number'] = int(start_number)
1426                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1427                     if segment_timeline is not None:
1428                         s_e = segment_timeline.findall(_add_ns('S'))
1429                         if s_e:
1430                             ms_info['total_number'] = 0
1431                             for s in s_e:
1432                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1433                     else:
1434                         timescale = segment_template.get('timescale')
1435                         if timescale:
1436                             ms_info['timescale'] = int(timescale)
1437                         segment_duration = segment_template.get('duration')
1438                         if segment_duration:
1439                             ms_info['segment_duration'] = int(segment_duration)
1440                     media_template = segment_template.get('media')
1441                     if media_template:
1442                         ms_info['media_template'] = media_template
1443                     initialization = segment_template.get('initialization')
1444                     if initialization:
1445                         ms_info['initialization_url'] = initialization
1446                     else:
1447                         initialization = segment_template.find(_add_ns('Initialization'))
1448                         if initialization is not None:
1449                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1450             return ms_info
1451
1452         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1453         formats = []
1454         for period in mpd_doc.findall(_add_ns('Period')):
1455             period_duration = parse_duration(period.get('duration')) or mpd_duration
1456             period_ms_info = extract_multisegment_info(period, {
1457                 'start_number': 1,
1458                 'timescale': 1,
1459             })
1460             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1461                 if is_drm_protected(adaptation_set):
1462                     continue
1463                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1464                 for representation in adaptation_set.findall(_add_ns('Representation')):
1465                     if is_drm_protected(representation):
1466                         continue
1467                     representation_attrib = adaptation_set.attrib.copy()
1468                     representation_attrib.update(representation.attrib)
1469                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1470                     mime_type = representation_attrib['mimeType']
1471                     content_type = mime_type.split('/')[0]
1472                     if content_type == 'text':
1473                         # TODO implement WebVTT downloading
1474                         pass
1475                     elif content_type == 'video' or content_type == 'audio':
1476                         base_url = ''
1477                         for element in (representation, adaptation_set, period, mpd_doc):
1478                             base_url_e = element.find(_add_ns('BaseURL'))
1479                             if base_url_e is not None:
1480                                 base_url = base_url_e.text + base_url
1481                                 if re.match(r'^https?://', base_url):
1482                                     break
1483                         if mpd_base_url and not re.match(r'^https?://', base_url):
1484                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1485                                 mpd_base_url += '/'
1486                             base_url = mpd_base_url + base_url
1487                         representation_id = representation_attrib.get('id')
1488                         lang = representation_attrib.get('lang')
1489                         url_el = representation.find(_add_ns('BaseURL'))
1490                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1491                         f = {
1492                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1493                             'url': base_url,
1494                             'ext': mimetype2ext(mime_type),
1495                             'width': int_or_none(representation_attrib.get('width')),
1496                             'height': int_or_none(representation_attrib.get('height')),
1497                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1498                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1499                             'fps': int_or_none(representation_attrib.get('frameRate')),
1500                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1501                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1502                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1503                             'format_note': 'DASH %s' % content_type,
1504                             'filesize': filesize,
1505                         }
1506                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1507                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1508                             if 'total_number' not in representation_ms_info and 'segment_duration':
1509                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1510                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1511                             media_template = representation_ms_info['media_template']
1512                             media_template = media_template.replace('$RepresentationID$', representation_id)
1513                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1514                             media_template.replace('$$', '$')
1515                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1516                         if 'segment_urls' in representation_ms_info:
1517                             f.update({
1518                                 'segment_urls': representation_ms_info['segment_urls'],
1519                                 'protocol': 'http_dash_segments',
1520                             })
1521                             if 'initialization_url' in representation_ms_info:
1522                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1523                                 f.update({
1524                                     'initialization_url': initialization_url,
1525                                 })
1526                                 if not f.get('url'):
1527                                     f['url'] = initialization_url
1528                         try:
1529                             existing_format = next(
1530                                 fo for fo in formats
1531                                 if fo['format_id'] == representation_id)
1532                         except StopIteration:
1533                             full_info = formats_dict.get(representation_id, {}).copy()
1534                             full_info.update(f)
1535                             formats.append(full_info)
1536                         else:
1537                             existing_format.update(f)
1538                     else:
1539                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1540         return formats
1541
1542     def _live_title(self, name):
1543         """ Generate the title for a live video """
1544         now = datetime.datetime.now()
1545         now_str = now.strftime('%Y-%m-%d %H:%M')
1546         return name + ' ' + now_str
1547
1548     def _int(self, v, name, fatal=False, **kwargs):
1549         res = int_or_none(v, **kwargs)
1550         if 'get_attr' in kwargs:
1551             print(getattr(v, kwargs['get_attr']))
1552         if res is None:
1553             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1554             if fatal:
1555                 raise ExtractorError(msg)
1556             else:
1557                 self._downloader.report_warning(msg)
1558         return res
1559
1560     def _float(self, v, name, fatal=False, **kwargs):
1561         res = float_or_none(v, **kwargs)
1562         if res is None:
1563             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1564             if fatal:
1565                 raise ExtractorError(msg)
1566             else:
1567                 self._downloader.report_warning(msg)
1568         return res
1569
1570     def _set_cookie(self, domain, name, value, expire_time=None):
1571         cookie = compat_cookiejar.Cookie(
1572             0, name, value, None, None, domain, None,
1573             None, '/', True, False, expire_time, '', None, None, None)
1574         self._downloader.cookiejar.set_cookie(cookie)
1575
1576     def _get_cookies(self, url):
1577         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1578         req = sanitized_Request(url)
1579         self._downloader.cookiejar.add_cookie_header(req)
1580         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1581
1582     def get_testcases(self, include_onlymatching=False):
1583         t = getattr(self, '_TEST', None)
1584         if t:
1585             assert not hasattr(self, '_TESTS'), \
1586                 '%s has _TEST and _TESTS' % type(self).__name__
1587             tests = [t]
1588         else:
1589             tests = getattr(self, '_TESTS', [])
1590         for t in tests:
1591             if not include_onlymatching and t.get('only_matching', False):
1592                 continue
1593             t['name'] = type(self).__name__[:-len('IE')]
1594             yield t
1595
1596     def is_suitable(self, age_limit):
1597         """ Test whether the extractor is generally suitable for the given
1598         age limit (i.e. pornographic sites are not, all others usually are) """
1599
1600         any_restricted = False
1601         for tc in self.get_testcases(include_onlymatching=False):
1602             if 'playlist' in tc:
1603                 tc = tc['playlist'][0]
1604             is_restricted = age_restricted(
1605                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1606             if not is_restricted:
1607                 return True
1608             any_restricted = any_restricted or is_restricted
1609         return not any_restricted
1610
1611     def extract_subtitles(self, *args, **kwargs):
1612         if (self._downloader.params.get('writesubtitles', False) or
1613                 self._downloader.params.get('listsubtitles')):
1614             return self._get_subtitles(*args, **kwargs)
1615         return {}
1616
1617     def _get_subtitles(self, *args, **kwargs):
1618         raise NotImplementedError('This method must be implemented by subclasses')
1619
1620     @staticmethod
1621     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1622         """ Merge subtitle items for one language. Items with duplicated URLs
1623         will be dropped. """
1624         list1_urls = set([item['url'] for item in subtitle_list1])
1625         ret = list(subtitle_list1)
1626         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1627         return ret
1628
1629     @classmethod
1630     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1631         """ Merge two subtitle dictionaries, language by language. """
1632         ret = dict(subtitle_dict1)
1633         for lang in subtitle_dict2:
1634             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1635         return ret
1636
1637     def extract_automatic_captions(self, *args, **kwargs):
1638         if (self._downloader.params.get('writeautomaticsub', False) or
1639                 self._downloader.params.get('listsubtitles')):
1640             return self._get_automatic_captions(*args, **kwargs)
1641         return {}
1642
1643     def _get_automatic_captions(self, *args, **kwargs):
1644         raise NotImplementedError('This method must be implemented by subclasses')
1645
1646     def mark_watched(self, *args, **kwargs):
1647         if (self._downloader.params.get('mark_watched', False) and
1648                 (self._get_login_info()[0] is not None or
1649                     self._downloader.params.get('cookiefile') is not None)):
1650             self._mark_watched(*args, **kwargs)
1651
1652     def _mark_watched(self, *args, **kwargs):
1653         raise NotImplementedError('This method must be implemented by subclasses')
1654
1655
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.

    Accepted queries have the form _SEARCH_KEY + prefix + ':' + query, where
    prefix is empty (one result), a positive number (that many results) or
    'all' (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        Raises ExtractorError for a query that does not match the expected
        form. A request for more than _MAX_RESULTS results is reduced to
        _MAX_RESULTS after reporting a warning.
        """
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = parsed.group('prefix')
        query = parsed.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key prefix (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY