_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urllib_request,
  26     compat_urlparse,
  27 )
  28 from ..downloader.f4m import remove_encrypted_media
  29 from ..utils import (
  30     NO_DEFAULT,
  31     age_restricted,
  32     bug_reports_message,
  33     clean_html,
  34     compiled_regex_type,
  35     determine_ext,
  36     error_to_compat_str,
  37     ExtractorError,
  38     fix_xml_ampersands,
  39     float_or_none,
  40     int_or_none,
  41     parse_iso8601,
  42     RegexNotFoundError,
  43     sanitize_filename,
  44     sanitized_Request,
  45     unescapeHTML,
  46     unified_strdate,
  47     url_basename,
  48     xpath_text,
  49     xpath_with_ns,
  50     determine_protocol,
  51     parse_duration,
  52     mimetype2ext,
  53     update_Request,
  54     update_url_query,
  55 )
  56
  57
  58 class InfoExtractor(object):
  59     """Information Extractor class.
  60
  61     Information extractors are the classes that, given a URL, extract
  62     information about the video (or videos) the URL refers to. This
  63     information includes the real video URL, the video title, author and
  64     others. The information is stored in a dictionary which is then
  65     passed to the YoutubeDL. The YoutubeDL processes this
  66     information possibly downloading the video to the file system, among
  67     other possible outcomes.
  68
  69     The type field determines the type of the result.
  70     By far the most common value (and the default if _type is missing) is
  71     "video", which indicates a single video.
  72
  73     For a video, the dictionaries must include the following fields:
  74
  75     id:             Video identifier.
  76     title:          Video title, unescaped.
  77
  78     Additionally, it must contain either a formats entry or a url one:
  79
  80     formats:        A list of dictionaries for each format available, ordered
  81                     from worst to best quality.
  82
  83                     Potential fields:
  84                     * url        Mandatory. The URL of the video file
  85                     * ext        Will be calculated from URL if missing
  86                     * format     A human-readable description of the format
  87                                  ("mp4 container with h264/opus").
  88                                  Calculated from the format_id, width, height,
  89                                  and format_note fields if missing.
  90                     * format_id  A short description of the format
  91                                  ("mp4_h264_opus" or "19").
  92                                 Technically optional, but strongly recommended.
  93                     * format_note Additional info about the format
  94                                  ("3D" or "DASH video")
  95                     * width      Width of the video, if known
  96                     * height     Height of the video, if known
  97                     * resolution Textual description of width and height
  98                     * tbr        Average bitrate of audio and video in KBit/s
  99                     * abr        Average audio bitrate in KBit/s
 100                     * acodec     Name of the audio codec in use
 101                     * asr        Audio sampling rate in Hertz
 102                     * vbr        Average video bitrate in KBit/s
 103                     * fps        Frame rate
 104                     * vcodec     Name of the video codec in use
 105                     * container  Name of the container format
 106                     * filesize   The number of bytes, if known in advance
 107                     * filesize_approx  An estimate for the number of bytes
 108                     * player_url SWF Player URL (used for rtmpdump).
 109                     * protocol   The protocol that will be used for the actual
 110                                  download, lower-case.
 111                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 112                                  "m3u8", "m3u8_native" or "http_dash_segments".
 113                     * preference Order number of this format. If this field is
 114                                  present and not None, the formats get sorted
 115                                  by this field, regardless of all other values.
 116                                  -1 for default (order by other properties),
 117                                  -2 or smaller for less than default.
 118                                  < -1000 to hide the format (if there is
 119                                     another one which is strictly better)
 120                     * language   Language code, e.g. "de" or "en-US".
 121                     * language_preference  Is this in the language mentioned in
 122                                  the URL?
 123                                  10 if it's what the URL is about,
 124                                  -1 for default (don't know),
 125                                  -10 otherwise, other values reserved for now.
 126                     * quality    Order number of the video quality of this
 127                                  format, irrespective of the file format.
 128                                  -1 for default (order by other properties),
 129                                  -2 or smaller for less than default.
 130                     * source_preference  Order number for this video source
 131                                   (quality takes higher priority)
 132                                  -1 for default (order by other properties),
 133                                  -2 or smaller for less than default.
 134                     * http_headers  A dictionary of additional HTTP headers
 135                                  to add to the request.
 136                     * stretched_ratio  If given and not 1, indicates that the
 137                                  video's pixels are not square.
 138                                  width : height ratio as float.
 139                     * no_resume  The server does not support resuming the
 140                                  (HTTP or RTMP) download. Boolean.
 141
 142     url:            Final video URL.
 143     ext:            Video filename extension.
 144     format:         The video format, defaults to ext (used for --get-format)
 145     player_url:     SWF Player URL (used for rtmpdump).
 146
 147     The following fields are optional:
 148
 149     alt_title:      A secondary title of the video.
 150     display_id      An alternative identifier for the video, not necessarily
 151                     unique, but available before title. Typically, id is
 152                     something like "4234987", title "Dancing naked mole rats",
 153                     and display_id "dancing-naked-mole-rats"
 154     thumbnails:     A list of dictionaries, with the following entries:
 155                         * "id" (optional, string) - Thumbnail format ID
 156                         * "url"
 157                         * "preference" (optional, int) - quality of the image
 158                         * "width" (optional, int)
 159                         * "height" (optional, int)
 160                         * "resolution" (optional, string "{width}x{height}",
 161                                         deprecated)
 162     thumbnail:      Full URL to a video thumbnail image.
 163     description:    Full video description.
 164     uploader:       Full name of the video uploader.
 165     license:        License name the video is licensed under.
 166     creator:        The main artist who created the video.
 167     release_date:   The date (YYYYMMDD) when the video was released.
 168     timestamp:      UNIX timestamp of the moment the video became available.
 169     upload_date:    Video upload date (YYYYMMDD).
 170                     If not explicitly set, calculated from timestamp.
 171     uploader_id:    Nickname or id of the video uploader.
 172     uploader_url:   Full URL to a personal webpage of the video uploader.
 173     location:       Physical location where the video was filmed.
 174     subtitles:      The available subtitles as a dictionary in the format
 175                     {language: subformats}. "subformats" is a list sorted from
 176                     lower to higher preference, each element is a dictionary
 177                     with the "ext" entry and one of:
 178                         * "data": The subtitles file contents
 179                         * "url": A URL pointing to the subtitles file
 180                     "ext" will be calculated from URL if missing
 181     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 182                     automatically generated captions
 183     duration:       Length of the video in seconds, as an integer or float.
 184     view_count:     How many users have watched the video on the platform.
 185     like_count:     Number of positive ratings of the video
 186     dislike_count:  Number of negative ratings of the video
 187     repost_count:   Number of reposts of the video
 188     average_rating: Average rating given by users, the scale used depends on the webpage
 189     comment_count:  Number of comments on the video
 190     comments:       A list of comments, each with one or more of the following
 191                     properties (all but one of text or html optional):
 192                         * "author" - human-readable name of the comment author
 193                         * "author_id" - user ID of the comment author
 194                         * "id" - Comment ID
 195                         * "html" - Comment as HTML
 196                         * "text" - Plain text of the comment
 197                         * "timestamp" - UNIX timestamp of comment
 198                         * "parent" - ID of the comment this one is replying to.
 199                                      Set to "root" to indicate that this is a
 200                                      comment to the original video.
 201     age_limit:      Age restriction for the video, as an integer (years)
 202     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 203                     should allow to get the same result again. (It will be set
 204                     by YoutubeDL if it's missing)
 205     categories:     A list of categories that the video falls in, for example
 206                     ["Sports", "Berlin"]
 207     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 208     is_live:        True, False, or None (=unknown). Whether this video is a
 209                     live stream that goes on instead of a fixed-length video.
 210     start_time:     Time in seconds where the reproduction should start, as
 211                     specified in the URL.
 212     end_time:       Time in seconds where the reproduction should end, as
 213                     specified in the URL.
 214
 215     The following fields should only be used when the video belongs to some logical
 216     chapter or section:
 217
 218     chapter:        Name or title of the chapter the video belongs to.
 219     chapter_number: Number of the chapter the video belongs to, as an integer.
 220     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 221
 222     The following fields should only be used when the video is an episode of some
 223     series or programme:
 224
 225     series:         Title of the series or programme the video episode belongs to.
 226     season:         Title of the season the video episode belongs to.
 227     season_number:  Number of the season the video episode belongs to, as an integer.
 228     season_id:      Id of the season the video episode belongs to, as a unicode string.
 229     episode:        Title of the video episode. Unlike mandatory video title field,
 230                     this field should denote the exact title of the video episode
 231                     without any kind of decoration.
 232     episode_number: Number of the video episode within a season, as an integer.
 233     episode_id:     Id of the video episode, as a unicode string.
 234
 235     Unless mentioned otherwise, the fields should be Unicode strings.
 236
 237     Unless mentioned otherwise, None is equivalent to absence of information.
 238
 239
 240     _type "playlist" indicates multiple videos.
 241     There must be a key "entries", which is a list, an iterable, or a PagedList
 242     object, each element of which is a valid dictionary by this specification.
 243
 244     Additionally, playlists can have "title", "description" and "id" attributes
 245     with the same semantics as videos (see above).
 246
 247
 248     _type "multi_video" indicates that there are multiple videos that
 249     form a single show, for example multiple acts of an opera or TV episode.
 250     It must have an entries key like a playlist and contain all the keys
 251     required for a video at the same time.
 252
 253
 254     _type "url" indicates that the video must be extracted from another
 255     location, possibly by a different extractor. Its only required key is:
 256     "url" - the next URL to extract.
 257     The key "ie_key" can be set to the class name (minus the trailing "IE",
 258     e.g. "Youtube") if the extractor class is known in advance.
 259     Additionally, the dictionary may have any properties of the resolved entity
 260     known in advance, for example "title" if the title of the referred video is
 261     known ahead of time.
 262
 263
 264     _type "url_transparent" entities have the same specification as "url", but
 265     indicate that the given additional information is more precise than the one
 266     associated with the resolved URL.
 267     This is useful when a site employs a video service that hosts the video and
 268     its technical metadata, but that video service does not embed a useful
 269     title, description etc.
 270
 271
 272     Subclasses of this one should re-define the _real_initialize() and
 273     _real_extract() methods and define a _VALID_URL regexp.
 274     Probably, they should also be added to the list of extractors.
 275
 276     Finally, the _WORKING attribute should be set to False for broken IEs
 277     in order to warn the users and skip the tests.
 278     """
 279
 280     _ready = False  # set to True by initialize() once _real_initialize() has run
 281     _downloader = None  # downloader object, assigned through set_downloader()
 282     _WORKING = True  # value returned by working(); set to False for broken IEs
 283
 284     def __init__(self, downloader=None):
 285         """Constructor. Receives an optional downloader."""
 286         self._ready = False
 287         self.set_downloader(downloader)
 288
 289     @classmethod
 290     def suitable(cls, url):
 291         """Receives a URL and returns True if suitable for this IE."""
 292
 293         # This does not use has/getattr intentionally - we want to know whether
 294         # we have cached the regexp for *this* class, whereas getattr would also
 295         # match the superclass
 296         if '_VALID_URL_RE' not in cls.__dict__:
 297             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 298         return cls._VALID_URL_RE.match(url) is not None
 299
 300     @classmethod
 301     def _match_id(cls, url):
 302         if '_VALID_URL_RE' not in cls.__dict__:
 303             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 304         m = cls._VALID_URL_RE.match(url)
 305         assert m
 306         return m.group('id')
 307
 308     @classmethod
 309     def working(cls):
 310         """Getter method for _WORKING."""
 311         return cls._WORKING
 312
 313     def initialize(self):
 314         """Initializes an instance (authentication, etc)."""
 315         if not self._ready:
 316             self._real_initialize()
 317             self._ready = True
 318
 319     def extract(self, url):
 320         """Extracts URL information and returns it in list of dicts."""
 321         try:
 322             self.initialize()
 323             return self._real_extract(url)
 324         except ExtractorError:
 325             raise
 326         except compat_http_client.IncompleteRead as e:
 327             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 328         except (KeyError, StopIteration) as e:
 329             raise ExtractorError('An extractor error has occurred.', cause=e)
 330
 331     def set_downloader(self, downloader):
 332         """Sets the downloader for this IE."""
 333         self._downloader = downloader
 334
 335     def _real_initialize(self):
 336         """Real initialization process. Redefine in subclasses."""
 337         pass
 338
 339     def _real_extract(self, url):
 340         """Real extraction process. Redefine in subclasses."""
 341         pass
 342
 343     @classmethod
 344     def ie_key(cls):
 345         """A string for getting the InfoExtractor with get_info_extractor"""
 346         return compat_str(cls.__name__[:-2])
 347
 348     @property
 349     def IE_NAME(self):
 350         return compat_str(type(self).__name__[:-2])
 351
 352     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 353         """ Returns the response handle """
 354         if note is None:
 355             self.report_download_webpage(video_id)
 356         elif note is not False:
 357             if video_id is None:
 358                 self.to_screen('%s' % (note,))
 359             else:
 360                 self.to_screen('%s: %s' % (video_id, note))
 361         # data, headers and query params will be ignored for `Request` objects
 362         if isinstance(url_or_request, compat_urllib_request.Request):
 363             url_or_request = update_Request(
 364                 url_or_request, data=data, headers=headers, query=query)
 365         else:
 366             if query:
 367                 url_or_request = update_url_query(url_or_request, query)
 368             if data or headers:
 369                 url_or_request = sanitized_Request(url_or_request, data, headers)
 370         try:
 371             return self._downloader.urlopen(url_or_request)
 372         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 373             if errnote is False:
 374                 return False
 375             if errnote is None:
 376                 errnote = 'Unable to download webpage'
 377
 378             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 379             if fatal:
 380                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 381             else:
 382                 self._downloader.report_warning(errmsg)
 383                 return False
 384
 385     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 386         """ Returns a tuple (page content as string, URL handle) """
 387         # Strip hashes from the URL (#1038)
 388         if isinstance(url_or_request, (compat_str, str)):
 389             url_or_request = url_or_request.partition('#')[0]
 390
 391         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 392         if urlh is False:
 393             assert not fatal
 394             return False
 395         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 396         return (content, urlh)
 397
 398     @staticmethod
 399     def _guess_encoding_from_content(content_type, webpage_bytes):
 400         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 401         if m:
 402             encoding = m.group(1)
 403         else:
 404             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 405                           webpage_bytes[:1024])
 406             if m:
 407                 encoding = m.group(1).decode('ascii')
 408             elif webpage_bytes.startswith(b'\xff\xfe'):
 409                 encoding = 'utf-16'
 410             else:
 411                 encoding = 'utf-8'
 412
 413         return encoding
 414
 415     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 416         content_type = urlh.headers.get('Content-Type', '')
 417         webpage_bytes = urlh.read()
 418         if prefix is not None:
 419             webpage_bytes = prefix + webpage_bytes
 420         if not encoding:
 421             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 422         if self._downloader.params.get('dump_intermediate_pages', False):
 423             try:
 424                 url = url_or_request.get_full_url()
 425             except AttributeError:
 426                 url = url_or_request
 427             self.to_screen('Dumping request to ' + url)
 428             dump = base64.b64encode(webpage_bytes).decode('ascii')
 429             self._downloader.to_screen(dump)
 430         if self._downloader.params.get('write_pages', False):
 431             try:
 432                 url = url_or_request.get_full_url()
 433             except AttributeError:
 434                 url = url_or_request
 435             basen = '%s_%s' % (video_id, url)
 436             if len(basen) > 240:
 437                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 438                 basen = basen[:240 - len(h)] + h
 439             raw_filename = basen + '.dump'
 440             filename = sanitize_filename(raw_filename, restricted=True)
 441             self.to_screen('Saving request to ' + filename)
 442             # Working around MAX_PATH limitation on Windows (see
 443             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 444             if compat_os_name == 'nt':
 445                 absfilepath = os.path.abspath(filename)
 446                 if len(absfilepath) > 259:
 447                     filename = '\\\\?\\' + absfilepath
 448             with open(filename, 'wb') as outf:
 449                 outf.write(webpage_bytes)
 450
 451         try:
 452             content = webpage_bytes.decode(encoding, 'replace')
 453         except LookupError:
 454             content = webpage_bytes.decode('utf-8', 'replace')
 455
 456         if ('<title>Access to this site is blocked</title>' in content and
 457                 'Websense' in content[:512]):
 458             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 459             blocked_iframe = self._html_search_regex(
 460                 r'<iframe src="([^"]+)"', content,
 461                 'Websense information URL', default=None)
 462             if blocked_iframe:
 463                 msg += ' Visit %s for more details' % blocked_iframe
 464             raise ExtractorError(msg, expected=True)
 465         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 466             msg = (
 467                 'Access to this webpage has been blocked by Indian censorship. '
 468                 'Use a VPN or proxy server (with --proxy) to route around it.')
 469             block_msg = self._html_search_regex(
 470                 r'</h1><p>(.*?)</p>',
 471                 content, 'block message', default=None)
 472             if block_msg:
 473                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 474             raise ExtractorError(msg, expected=True)
 475
 476         return content
 477
 478     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 479         """ Returns the data of the page as a string """
 480         success = False
 481         try_count = 0
 482         while success is False:
 483             try:
 484                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 485                 success = True
 486             except compat_http_client.IncompleteRead as e:
 487                 try_count += 1
 488                 if try_count >= tries:
 489                     raise e
 490                 self._sleep(timeout, video_id)
 491         if res is False:
 492             return res
 493         else:
 494             content, _ = res
 495             return content
 496
 497     def _download_xml(self, url_or_request, video_id,
 498                       note='Downloading XML', errnote='Unable to download XML',
 499                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 500         """Return the xml as an xml.etree.ElementTree.Element"""
 501         xml_string = self._download_webpage(
 502             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 503         if xml_string is False:
 504             return xml_string
 505         if transform_source:
 506             xml_string = transform_source(xml_string)
 507         return compat_etree_fromstring(xml_string.encode('utf-8'))
 508
 509     def _download_json(self, url_or_request, video_id,
 510                        note='Downloading JSON metadata',
 511                        errnote='Unable to download JSON metadata',
 512                        transform_source=None,
 513                        fatal=True, encoding=None, data=None, headers={}, query={}):
 514         json_string = self._download_webpage(
 515             url_or_request, video_id, note, errnote, fatal=fatal,
 516             encoding=encoding, data=data, headers=headers, query=query)
 517         if (not fatal) and json_string is False:
 518             return None
 519         return self._parse_json(
 520             json_string, video_id, transform_source=transform_source, fatal=fatal)
 521
 522     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 523         if transform_source:
 524             json_string = transform_source(json_string)
 525         try:
 526             return json.loads(json_string)
 527         except ValueError as ve:
 528             errmsg = '%s: Failed to parse JSON ' % video_id
 529             if fatal:
 530                 raise ExtractorError(errmsg, cause=ve)
 531             else:
 532                 self.report_warning(errmsg + str(ve))
 533
 534     def report_warning(self, msg, video_id=None):
 535         idstr = '' if video_id is None else '%s: ' % video_id
 536         self._downloader.report_warning(
 537             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 538
 539     def to_screen(self, msg):
 540         """Print msg to screen, prefixing it with '[ie_name]'"""
 541         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 542
 543     def report_extraction(self, id_or_name):
 544         """Report information extraction."""
 545         self.to_screen('%s: Extracting information' % id_or_name)
 546
 547     def report_download_webpage(self, video_id):
 548         """Report webpage download."""
 549         self.to_screen('%s: Downloading webpage' % video_id)
 550
 551     def report_age_confirmation(self):
 552         """Report attempt to confirm age."""
 553         self.to_screen('Confirming age')
 554
 555     def report_login(self):
 556         """Report attempt to log in."""
 557         self.to_screen('Logging in')
 558
 559     @staticmethod
 560     def raise_login_required(msg='This video is only available for registered users'):
 561         raise ExtractorError(
 562             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 563             expected=True)
 564
 565     @staticmethod
 566     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 567         raise ExtractorError(
 568             '%s. You might want to use --proxy to workaround.' % msg,
 569             expected=True)
 570
 571     # Methods for following #608
 572     @staticmethod
 573     def url_result(url, ie=None, video_id=None, video_title=None):
 574         """Returns a URL that points to a page that should be processed"""
 575         # TODO: ie should be the class used for getting the info
 576         video_info = {'_type': 'url',
 577                       'url': url,
 578                       'ie_key': ie}
 579         if video_id is not None:
 580             video_info['id'] = video_id
 581         if video_title is not None:
 582             video_info['title'] = video_title
 583         return video_info
 584
 585     @staticmethod
 586     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 587         """Returns a playlist"""
 588         video_info = {'_type': 'playlist',
 589                       'entries': entries}
 590         if playlist_id:
 591             video_info['id'] = playlist_id
 592         if playlist_title:
 593             video_info['title'] = playlist_title
 594         if playlist_description:
 595             video_info['description'] = playlist_description
 596         return video_info
 597
 598     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 599         """
 600         Perform a regex search on the given string, using a single or a list of
 601         patterns returning the first matching group.
 602         In case of failure return a default value or raise a WARNING or a
 603         RegexNotFoundError, depending on fatal, specifying the field name.
 604         """
 605         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 606             mobj = re.search(pattern, string, flags)
 607         else:
 608             for p in pattern:
 609                 mobj = re.search(p, string, flags)
 610                 if mobj:
 611                     break
 612
 613         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 614             _name = '\033[0;34m%s\033[0m' % name
 615         else:
 616             _name = name
 617
 618         if mobj:
 619             if group is None:
 620                 # return the first matching group
 621                 return next(g for g in mobj.groups() if g is not None)
 622             else:
 623                 return mobj.group(group)
 624         elif default is not NO_DEFAULT:
 625             return default
 626         elif fatal:
 627             raise RegexNotFoundError('Unable to extract %s' % _name)
 628         else:
 629             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 630             return None
 631
 632     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 633         """
 634         Like _search_regex, but strips HTML tags and unescapes entities.
 635         """
 636         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 637         if res:
 638             return clean_html(res).strip()
 639         else:
 640             return res
 641
 642     def _get_login_info(self):
 643         """
 644         Get the login info as (username, password)
 645         It will look in the netrc file using the _NETRC_MACHINE value
 646         If there's no info available, return (None, None)
 647         """
 648         if self._downloader is None:
 649             return (None, None)
 650
 651         username = None
 652         password = None
 653         downloader_params = self._downloader.params
 654
 655         # Attempt to use provided username and password or .netrc data
 656         if downloader_params.get('username') is not None:
 657             username = downloader_params['username']
 658             password = downloader_params['password']
 659         elif downloader_params.get('usenetrc', False):
 660             try:
 661                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 662                 if info is not None:
 663                     username = info[0]
 664                     password = info[2]
 665                 else:
 666                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 667             except (IOError, netrc.NetrcParseError) as err:
 668                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 669
 670         return (username, password)
 671
 672     def _get_tfa_info(self, note='two-factor verification code'):
 673         """
 674         Get the two-factor authentication info
 675         TODO - asking the user will be required for sms/phone verify
 676         currently just uses the command line option
 677         If there's no info available, return None
 678         """
 679         if self._downloader is None:
 680             return None
 681         downloader_params = self._downloader.params
 682
 683         if downloader_params.get('twofactor') is not None:
 684             return downloader_params['twofactor']
 685
 686         return compat_getpass('Type %s and press [Return]: ' % note)
 687
 688     # Helper functions for extracting OpenGraph info
 689     @staticmethod
 690     def _og_regexes(prop):
 691         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 692         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 693                        % {'prop': re.escape(prop)})
 694         template = r'<meta[^>]+?%s[^>]+?%s'
 695         return [
 696             template % (property_re, content_re),
 697             template % (content_re, property_re),
 698         ]
 699
 700     @staticmethod
 701     def _meta_regex(prop):
 702         return r'''(?isx)<meta
 703                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 704                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 705
 706     def _og_search_property(self, prop, html, name=None, **kargs):
 707         if name is None:
 708             name = 'OpenGraph %s' % prop
 709         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 710         if escaped is None:
 711             return None
 712         return unescapeHTML(escaped)
 713
 714     def _og_search_thumbnail(self, html, **kargs):
 715         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 716
 717     def _og_search_description(self, html, **kargs):
 718         return self._og_search_property('description', html, fatal=False, **kargs)
 719
 720     def _og_search_title(self, html, **kargs):
 721         return self._og_search_property('title', html, **kargs)
 722
 723     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 724         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 725         if secure:
 726             regexes = self._og_regexes('video:secure_url') + regexes
 727         return self._html_search_regex(regexes, html, name, **kargs)
 728
 729     def _og_search_url(self, html, **kargs):
 730         return self._og_search_property('url', html, **kargs)
 731
 732     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 733         if display_name is None:
 734             display_name = name
 735         return self._html_search_regex(
 736             self._meta_regex(name),
 737             html, display_name, fatal=fatal, group='content', **kwargs)
 738
 739     def _dc_search_uploader(self, html):
 740         return self._html_search_meta('dc.creator', html, 'uploader')
 741
 742     def _rta_search(self, html):
 743         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 744         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 745                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 746                      html):
 747             return 18
 748         return 0
 749
 750     def _media_rating_search(self, html):
 751         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 752         rating = self._html_search_meta('rating', html)
 753
 754         if not rating:
 755             return None
 756
 757         RATING_TABLE = {
 758             'safe for kids': 0,
 759             'general': 8,
 760             '14 years': 14,
 761             'mature': 17,
 762             'restricted': 19,
 763         }
 764         return RATING_TABLE.get(rating.lower())
 765
 766     def _family_friendly_search(self, html):
 767         # See http://schema.org/VideoObject
 768         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 769
 770         if not family_friendly:
 771             return None
 772
 773         RATING_TABLE = {
 774             '1': 0,
 775             'true': 0,
 776             '0': 18,
 777             'false': 18,
 778         }
 779         return RATING_TABLE.get(family_friendly.lower())
 780
 781     def _twitter_search_player(self, html):
 782         return self._html_search_meta('twitter:player', html,
 783                                       'twitter card player')
 784
 785     def _search_json_ld(self, html, video_id, **kwargs):
 786         json_ld = self._search_regex(
 787             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 788             html, 'JSON-LD', group='json_ld', **kwargs)
 789         if not json_ld:
 790             return {}
 791         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 792
 793     def _json_ld(self, json_ld, video_id, fatal=True):
 794         if isinstance(json_ld, compat_str):
 795             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 796         if not json_ld:
 797             return {}
 798         info = {}
 799         if json_ld.get('@context') == 'http://schema.org':
 800             item_type = json_ld.get('@type')
 801             if item_type == 'TVEpisode':
 802                 info.update({
 803                     'episode': unescapeHTML(json_ld.get('name')),
 804                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 805                     'description': unescapeHTML(json_ld.get('description')),
 806                 })
 807                 part_of_season = json_ld.get('partOfSeason')
 808                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 809                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 810                 part_of_series = json_ld.get('partOfSeries')
 811                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 812                     info['series'] = unescapeHTML(part_of_series.get('name'))
 813             elif item_type == 'Article':
 814                 info.update({
 815                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 816                     'title': unescapeHTML(json_ld.get('headline')),
 817                     'description': unescapeHTML(json_ld.get('articleBody')),
 818                 })
 819         return dict((k, v) for k, v in info.items() if v is not None)
 820
 821     @staticmethod
 822     def _hidden_inputs(html):
 823         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 824         hidden_inputs = {}
 825         for input in re.findall(r'(?i)<input([^>]+)>', html):
 826             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 827                 continue
 828             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 829             if not name:
 830                 continue
 831             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 832             if not value:
 833                 continue
 834             hidden_inputs[name.group('value')] = value.group('value')
 835         return hidden_inputs
 836
 837     def _form_hidden_inputs(self, form_id, html):
 838         form = self._search_regex(
 839             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 840             html, '%s form' % form_id, group='form')
 841         return self._hidden_inputs(form)
 842
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort the given list of format dicts in place.

        The list is ordered ascending by the key computed in _formats_key, so
        formats with a larger key end up towards the end of the list.

        formats          -- list of format dicts; entries may get 'tbr' and
                            'ext' filled in as a side effect
        field_preference -- optional list/tuple of format dict keys; when
                            given, the sort key consists only of the values
                            of these fields (missing/None counts as -1)

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            # Fill in a missing extension from the URL
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # An explicit field list overrides the default key entirely
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Anything that is not plain HTTP(S) gets a small penalty
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            # The position in ORDER is used as the extension preference, so
            # extensions listed later compare higher; unknown ones get -1
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare element by element, so earlier entries dominate
            # later ones; missing numeric values count as -1
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 913
 914     def _check_formats(self, formats, video_id):
 915         if formats:
 916             formats[:] = filter(
 917                 lambda f: self._is_valid_url(
 918                     f['url'], video_id,
 919                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 920                 formats)
 921
 922     @staticmethod
 923     def _remove_duplicate_formats(formats):
 924         format_urls = set()
 925         unique_formats = []
 926         for f in formats:
 927             if f['url'] not in format_urls:
 928                 format_urls.add(f['url'])
 929                 unique_formats.append(f)
 930         formats[:] = unique_formats
 931
 932     def _is_valid_url(self, url, video_id, item='video'):
 933         url = self._proto_relative_url(url, scheme='http:')
 934         # For now assume non HTTP(S) URLs always valid
 935         if not (url.startswith('http://') or url.startswith('https://')):
 936             return True
 937         try:
 938             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 939             return True
 940         except ExtractorError as e:
 941             if isinstance(e.cause, compat_urllib_error.URLError):
 942                 self.to_screen(
 943                     '%s: %s URL is invalid, skipping' % (video_id, item))
 944                 return False
 945             raise
 946
 947     def http_scheme(self):
 948         """ Either "http:" or "https:", depending on the user's preferences """
 949         return (
 950             'http:'
 951             if self._downloader.params.get('prefer_insecure', False)
 952             else 'https:')
 953
 954     def _proto_relative_url(self, url, scheme=None):
 955         if url is None:
 956             return url
 957         if url.startswith('//'):
 958             if scheme is None:
 959                 scheme = self.http_scheme()
 960             return scheme + url
 961         else:
 962             return url
 963
 964     def _sleep(self, timeout, video_id, msg_template=None):
 965         if msg_template is None:
 966             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 967         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 968         self.to_screen(msg)
 969         time.sleep(timeout)
 970
 971     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 972                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 973                              fatal=True):
 974         manifest = self._download_xml(
 975             manifest_url, video_id, 'Downloading f4m manifest',
 976             'Unable to download f4m manifest',
 977             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 978             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 979             transform_source=transform_source,
 980             fatal=fatal)
 981
 982         if manifest is False:
 983             return []
 984
 985         return self._parse_f4m_formats(
 986             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
 987             transform_source=transform_source, fatal=fatal)
 988
 989     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
 990                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
 991                            fatal=True):
 992         formats = []
 993         manifest_version = '1.0'
 994         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 995         if not media_nodes:
 996             manifest_version = '2.0'
 997             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 998         # Remove unsupported DRM protected media from final formats
 999         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1000         media_nodes = remove_encrypted_media(media_nodes)
1001         if not media_nodes:
1002             return formats
1003         base_url = xpath_text(
1004             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1005             'base URL', default=None)
1006         if base_url:
1007             base_url = base_url.strip()
1008         for i, media_el in enumerate(media_nodes):
1009             if manifest_version == '2.0':
1010                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
1011                 if not media_url:
1012                     continue
1013                 manifest_url = (
1014                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1015                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1016                 # If media_url is itself a f4m manifest do the recursive extraction
1017                 # since bitrates in parent manifest (this one) and media_url manifest
1018                 # may differ leading to inability to resolve the format by requested
1019                 # bitrate in f4m downloader
1020                 if determine_ext(manifest_url) == 'f4m':
1021                     formats.extend(self._extract_f4m_formats(
1022                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1023                         transform_source=transform_source, fatal=fatal))
1024                     continue
1025             tbr = int_or_none(media_el.attrib.get('bitrate'))
1026             formats.append({
1027                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1028                 'url': manifest_url,
1029                 'ext': 'flv',
1030                 'tbr': tbr,
1031                 'width': int_or_none(media_el.attrib.get('width')),
1032                 'height': int_or_none(media_el.attrib.get('height')),
1033                 'preference': preference,
1034             })
1035         return formats
1036
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """
        Download the m3u8 playlist at m3u8_url and return a list of formats.

        m3u8_url       -- URL of the playlist
        video_id       -- passed on to the download helper
        ext            -- stored as 'ext' of every format built here
        entry_protocol -- stored as 'protocol' of the formats (the leading
                          'meta' format always uses 'm3u8')
        preference     -- stored as 'preference' of the formats
        m3u8_id        -- optional prefix of the generated format ids
        note, errnote  -- messages for the download helper
        fatal          -- passed on to the download helper

        Returns an empty list when the download yields False, a single
        format pointing at the playlist when it is a media playlist, and
        otherwise a 'meta' format for the playlist itself followed by one
        format per URI line.
        """

        # First entry: the playlist itself
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Resolves non-absolute URIs against m3u8_url. The name is looked up
        # when the lambda is called, i.e. after m3u8_url has been replaced by
        # the final URL of the response below.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA tag
        last_info = None
        last_media = None
        # Matches KEY=value pairs where the value is either quoted or free of
        # quotes and commas
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        # Strip the surrounding quotes
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        # Strip the surrounding quotes
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags, comments and blank lines
                continue
            else:
                # A URI line
                if last_info is None:
                    # No #EXT-X-STREAM-INF tag seen so far: only the URL is known
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                # Prefer the media name, then the bitrate, then the running index
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                codecs = last_info.get('CODECS')
                if codecs:
                    vcodec, acodec = [None] * 2
                    va_codecs = codecs.split(',')
                    if len(va_codecs) == 1:
                        # Audio only entries usually come with single codec and
                        # no resolution. For more robustness we also check it to
                        # be mp4 audio.
                        if not resolution and va_codecs[0].startswith('mp4a'):
                            vcodec, acodec = 'none', va_codecs[0]
                        else:
                            vcodec = va_codecs[0]
                    else:
                        vcodec, acodec = va_codecs[:2]
                    f.update({
                        'acodec': acodec,
                        'vcodec': vcodec,
                    })
                if last_media is not None:
                    # Attach the pending #EXT-X-MEDIA attributes to this format only
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                # Reset the stream attributes for the next URI line
                last_info = {}
        return formats
1156
1157     @staticmethod
1158     def _xpath_ns(path, namespace=None):
1159         if not namespace:
1160             return path
1161         out = []
1162         for c in path.split('/'):
1163             if not c or c == '.':
1164                 out.append(c)
1165             else:
1166                 out.append('{%s}%s' % (namespace, c))
1167         return '/'.join(out)
1168
1169     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1170         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1171
1172         if smil is False:
1173             assert not fatal
1174             return []
1175
1176         namespace = self._parse_smil_namespace(smil)
1177
1178         return self._parse_smil_formats(
1179             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1180
1181     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1182         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1183         if smil is False:
1184             return {}
1185         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1186
1187     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1188         return self._download_xml(
1189             smil_url, video_id, 'Downloading SMIL file',
1190             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1191
1192     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1193         namespace = self._parse_smil_namespace(smil)
1194
1195         formats = self._parse_smil_formats(
1196             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1197         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1198
1199         video_id = os.path.splitext(url_basename(smil_url))[0]
1200         title = None
1201         description = None
1202         upload_date = None
1203         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1204             name = meta.attrib.get('name')
1205             content = meta.attrib.get('content')
1206             if not name or not content:
1207                 continue
1208             if not title and name == 'title':
1209                 title = content
1210             elif not description and name in ('description', 'abstract'):
1211                 description = content
1212             elif not upload_date and name == 'date':
1213                 upload_date = unified_strdate(content)
1214
1215         thumbnails = [{
1216             'id': image.get('type'),
1217             'url': image.get('src'),
1218             'width': int_or_none(image.get('width')),
1219             'height': int_or_none(image.get('height')),
1220         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1221
1222         return {
1223             'id': video_id,
1224             'title': title or video_id,
1225             'description': description,
1226             'upload_date': upload_date,
1227             'thumbnails': thumbnails,
1228             'formats': formats,
1229             'subtitles': subtitles,
1230         }
1231
1232     def _parse_smil_namespace(self, smil):
1233         return self._search_regex(
1234             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1235
1236     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1237         base = smil_url
1238         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1239             b = meta.get('base') or meta.get('httpBase')
1240             if b:
1241                 base = b
1242                 break
1243
1244         formats = []
1245         rtmp_count = 0
1246         http_count = 0
1247         m3u8_count = 0
1248
1249         srcs = []
1250         videos = smil.findall(self._xpath_ns('.//video', namespace))
1251         for video in videos:
1252             src = video.get('src')
1253             if not src or src in srcs:
1254                 continue
1255             srcs.append(src)
1256
1257             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1258             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1259             width = int_or_none(video.get('width'))
1260             height = int_or_none(video.get('height'))
1261             proto = video.get('proto')
1262             ext = video.get('ext')
1263             src_ext = determine_ext(src)
1264             streamer = video.get('streamer') or base
1265
1266             if proto == 'rtmp' or streamer.startswith('rtmp'):
1267                 rtmp_count += 1
1268                 formats.append({
1269                     'url': streamer,
1270                     'play_path': src,
1271                     'ext': 'flv',
1272                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1273                     'tbr': bitrate,
1274                     'filesize': filesize,
1275                     'width': width,
1276                     'height': height,
1277                 })
1278                 if transform_rtmp_url:
1279                     streamer, src = transform_rtmp_url(streamer, src)
1280                     formats[-1].update({
1281                         'url': streamer,
1282                         'play_path': src,
1283                     })
1284                 continue
1285
1286             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1287             src_url = src_url.strip()
1288
1289             if proto == 'm3u8' or src_ext == 'm3u8':
1290                 m3u8_formats = self._extract_m3u8_formats(
1291                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1292                 if len(m3u8_formats) == 1:
1293                     m3u8_count += 1
1294                     m3u8_formats[0].update({
1295                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1296                         'tbr': bitrate,
1297                         'width': width,
1298                         'height': height,
1299                     })
1300                 formats.extend(m3u8_formats)
1301                 continue
1302
1303             if src_ext == 'f4m':
1304                 f4m_url = src_url
1305                 if not f4m_params:
1306                     f4m_params = {
1307                         'hdcore': '3.2.0',
1308                         'plugin': 'flowplayer-3.2.0.1',
1309                     }
1310                 f4m_url += '&' if '?' in f4m_url else '?'
1311                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1312                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1313                 continue
1314
1315             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1316                 http_count += 1
1317                 formats.append({
1318                     'url': src_url,
1319                     'ext': ext or src_ext or 'flv',
1320                     'format_id': 'http-%d' % (bitrate or http_count),
1321                     'tbr': bitrate,
1322                     'filesize': filesize,
1323                     'width': width,
1324                     'height': height,
1325                 })
1326                 continue
1327
1328         return formats
1329
1330     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1331         urls = []
1332         subtitles = {}
1333         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1334             src = textstream.get('src')
1335             if not src or src in urls:
1336                 continue
1337             urls.append(src)
1338             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1339             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1340             subtitles.setdefault(lang, []).append({
1341                 'url': src,
1342                 'ext': ext,
1343             })
1344         return subtitles
1345
1346     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1347         xspf = self._download_xml(
1348             playlist_url, playlist_id, 'Downloading xpsf playlist',
1349             'Unable to download xspf manifest', fatal=fatal)
1350         if xspf is False:
1351             return []
1352         return self._parse_xspf(xspf, playlist_id)
1353
1354     def _parse_xspf(self, playlist, playlist_id):
1355         NS_MAP = {
1356             'xspf': 'http://xspf.org/ns/0/',
1357             's1': 'http://static.streamone.nl/player/ns/0',
1358         }
1359
1360         entries = []
1361         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1362             title = xpath_text(
1363                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1364             description = xpath_text(
1365                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1366             thumbnail = xpath_text(
1367                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1368             duration = float_or_none(
1369                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1370
1371             formats = [{
1372                 'url': location.text,
1373                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1374                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1375                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1376             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1377             self._sort_formats(formats)
1378
1379             entries.append({
1380                 'id': playlist_id,
1381                 'title': title,
1382                 'description': description,
1383                 'thumbnail': thumbnail,
1384                 'duration': duration,
1385                 'formats': formats,
1386             })
1387         return entries
1388
1389     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1390         res = self._download_webpage_handle(
1391             mpd_url, video_id,
1392             note=note or 'Downloading MPD manifest',
1393             errnote=errnote or 'Failed to download MPD manifest',
1394             fatal=fatal)
1395         if res is False:
1396             return []
1397         mpd, urlh = res
1398         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1399
1400         return self._parse_mpd_formats(
1401             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1402
1403     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1404         if mpd_doc.get('type') == 'dynamic':
1405             return []
1406
1407         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1408
1409         def _add_ns(path):
1410             return self._xpath_ns(path, namespace)
1411
1412         def is_drm_protected(element):
1413             return element.find(_add_ns('ContentProtection')) is not None
1414
1415         def extract_multisegment_info(element, ms_parent_info):
1416             ms_info = ms_parent_info.copy()
1417             segment_list = element.find(_add_ns('SegmentList'))
1418             if segment_list is not None:
1419                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1420                 if segment_urls_e:
1421                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1422                 initialization = segment_list.find(_add_ns('Initialization'))
1423                 if initialization is not None:
1424                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1425             else:
1426                 segment_template = element.find(_add_ns('SegmentTemplate'))
1427                 if segment_template is not None:
1428                     start_number = segment_template.get('startNumber')
1429                     if start_number:
1430                         ms_info['start_number'] = int(start_number)
1431                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1432                     if segment_timeline is not None:
1433                         s_e = segment_timeline.findall(_add_ns('S'))
1434                         if s_e:
1435                             ms_info['total_number'] = 0
1436                             for s in s_e:
1437                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1438                     else:
1439                         timescale = segment_template.get('timescale')
1440                         if timescale:
1441                             ms_info['timescale'] = int(timescale)
1442                         segment_duration = segment_template.get('duration')
1443                         if segment_duration:
1444                             ms_info['segment_duration'] = int(segment_duration)
1445                     media_template = segment_template.get('media')
1446                     if media_template:
1447                         ms_info['media_template'] = media_template
1448                     initialization = segment_template.get('initialization')
1449                     if initialization:
1450                         ms_info['initialization_url'] = initialization
1451                     else:
1452                         initialization = segment_template.find(_add_ns('Initialization'))
1453                         if initialization is not None:
1454                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1455             return ms_info
1456
1457         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1458         formats = []
1459         for period in mpd_doc.findall(_add_ns('Period')):
1460             period_duration = parse_duration(period.get('duration')) or mpd_duration
1461             period_ms_info = extract_multisegment_info(period, {
1462                 'start_number': 1,
1463                 'timescale': 1,
1464             })
1465             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1466                 if is_drm_protected(adaptation_set):
1467                     continue
1468                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1469                 for representation in adaptation_set.findall(_add_ns('Representation')):
1470                     if is_drm_protected(representation):
1471                         continue
1472                     representation_attrib = adaptation_set.attrib.copy()
1473                     representation_attrib.update(representation.attrib)
1474                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1475                     mime_type = representation_attrib['mimeType']
1476                     content_type = mime_type.split('/')[0]
1477                     if content_type == 'text':
1478                         # TODO implement WebVTT downloading
1479                         pass
1480                     elif content_type == 'video' or content_type == 'audio':
1481                         base_url = ''
1482                         for element in (representation, adaptation_set, period, mpd_doc):
1483                             base_url_e = element.find(_add_ns('BaseURL'))
1484                             if base_url_e is not None:
1485                                 base_url = base_url_e.text + base_url
1486                                 if re.match(r'^https?://', base_url):
1487                                     break
1488                         if mpd_base_url and not re.match(r'^https?://', base_url):
1489                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1490                                 mpd_base_url += '/'
1491                             base_url = mpd_base_url + base_url
1492                         representation_id = representation_attrib.get('id')
1493                         lang = representation_attrib.get('lang')
1494                         url_el = representation.find(_add_ns('BaseURL'))
1495                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1496                         f = {
1497                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1498                             'url': base_url,
1499                             'ext': mimetype2ext(mime_type),
1500                             'width': int_or_none(representation_attrib.get('width')),
1501                             'height': int_or_none(representation_attrib.get('height')),
1502                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1503                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1504                             'fps': int_or_none(representation_attrib.get('frameRate')),
1505                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1506                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1507                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1508                             'format_note': 'DASH %s' % content_type,
1509                             'filesize': filesize,
1510                         }
1511                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1512                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1513                             if 'total_number' not in representation_ms_info and 'segment_duration':
1514                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1515                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1516                             media_template = representation_ms_info['media_template']
1517                             media_template = media_template.replace('$RepresentationID$', representation_id)
1518                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1519                             media_template.replace('$$', '$')
1520                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1521                         if 'segment_urls' in representation_ms_info:
1522                             f.update({
1523                                 'segment_urls': representation_ms_info['segment_urls'],
1524                                 'protocol': 'http_dash_segments',
1525                             })
1526                             if 'initialization_url' in representation_ms_info:
1527                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1528                                 f.update({
1529                                     'initialization_url': initialization_url,
1530                                 })
1531                                 if not f.get('url'):
1532                                     f['url'] = initialization_url
1533                         try:
1534                             existing_format = next(
1535                                 fo for fo in formats
1536                                 if fo['format_id'] == representation_id)
1537                         except StopIteration:
1538                             full_info = formats_dict.get(representation_id, {}).copy()
1539                             full_info.update(f)
1540                             formats.append(full_info)
1541                         else:
1542                             existing_format.update(f)
1543                     else:
1544                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1545         return formats
1546
1547     def _live_title(self, name):
1548         """ Generate the title for a live video """
1549         now = datetime.datetime.now()
1550         now_str = now.strftime('%Y-%m-%d %H:%M')
1551         return name + ' ' + now_str
1552
1553     def _int(self, v, name, fatal=False, **kwargs):
1554         res = int_or_none(v, **kwargs)
1555         if 'get_attr' in kwargs:
1556             print(getattr(v, kwargs['get_attr']))
1557         if res is None:
1558             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1559             if fatal:
1560                 raise ExtractorError(msg)
1561             else:
1562                 self._downloader.report_warning(msg)
1563         return res
1564
1565     def _float(self, v, name, fatal=False, **kwargs):
1566         res = float_or_none(v, **kwargs)
1567         if res is None:
1568             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1569             if fatal:
1570                 raise ExtractorError(msg)
1571             else:
1572                 self._downloader.report_warning(msg)
1573         return res
1574
1575     def _set_cookie(self, domain, name, value, expire_time=None):
1576         cookie = compat_cookiejar.Cookie(
1577             0, name, value, None, None, domain, None,
1578             None, '/', True, False, expire_time, '', None, None, None)
1579         self._downloader.cookiejar.set_cookie(cookie)
1580
1581     def _get_cookies(self, url):
1582         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1583         req = sanitized_Request(url)
1584         self._downloader.cookiejar.add_cookie_header(req)
1585         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1586
1587     def get_testcases(self, include_onlymatching=False):
1588         t = getattr(self, '_TEST', None)
1589         if t:
1590             assert not hasattr(self, '_TESTS'), \
1591                 '%s has _TEST and _TESTS' % type(self).__name__
1592             tests = [t]
1593         else:
1594             tests = getattr(self, '_TESTS', [])
1595         for t in tests:
1596             if not include_onlymatching and t.get('only_matching', False):
1597                 continue
1598             t['name'] = type(self).__name__[:-len('IE')]
1599             yield t
1600
1601     def is_suitable(self, age_limit):
1602         """ Test whether the extractor is generally suitable for the given
1603         age limit (i.e. pornographic sites are not, all others usually are) """
1604
1605         any_restricted = False
1606         for tc in self.get_testcases(include_onlymatching=False):
1607             if 'playlist' in tc:
1608                 tc = tc['playlist'][0]
1609             is_restricted = age_restricted(
1610                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1611             if not is_restricted:
1612                 return True
1613             any_restricted = any_restricted or is_restricted
1614         return not any_restricted
1615
1616     def extract_subtitles(self, *args, **kwargs):
1617         if (self._downloader.params.get('writesubtitles', False) or
1618                 self._downloader.params.get('listsubtitles')):
1619             return self._get_subtitles(*args, **kwargs)
1620         return {}
1621
1622     def _get_subtitles(self, *args, **kwargs):
1623         raise NotImplementedError('This method must be implemented by subclasses')
1624
1625     @staticmethod
1626     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1627         """ Merge subtitle items for one language. Items with duplicated URLs
1628         will be dropped. """
1629         list1_urls = set([item['url'] for item in subtitle_list1])
1630         ret = list(subtitle_list1)
1631         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1632         return ret
1633
1634     @classmethod
1635     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1636         """ Merge two subtitle dictionaries, language by language. """
1637         ret = dict(subtitle_dict1)
1638         for lang in subtitle_dict2:
1639             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1640         return ret
1641
1642     def extract_automatic_captions(self, *args, **kwargs):
1643         if (self._downloader.params.get('writeautomaticsub', False) or
1644                 self._downloader.params.get('listsubtitles')):
1645             return self._get_automatic_captions(*args, **kwargs)
1646         return {}
1647
1648     def _get_automatic_captions(self, *args, **kwargs):
1649         raise NotImplementedError('This method must be implemented by subclasses')
1650
1651     def mark_watched(self, *args, **kwargs):
1652         if (self._downloader.params.get('mark_watched', False) and
1653                 (self._get_login_info()[0] is not None or
1654                     self._downloader.params.get('cookiefile') is not None)):
1655             self._mark_watched(*args, **kwargs)
1656
1657     def _mark_watched(self, *args, **kwargs):
1658         raise NotImplementedError('This method must be implemented by subclasses')
1659
1660
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept queries of the form _SEARCH_KEY<prefix>:<query> where the
    prefix is empty (one result), a positive number or 'all'.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's search queries."""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        mobj = re.match(cls._make_valid_url(), url)
        return mobj is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many results; a number above _MAX_RESULTS is
        lowered to _MAX_RESULTS after a warning.
        Raises ExtractorError when the query does not match the expected
        format or the number is not positive.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY