_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_http_client,
  20     compat_urllib_error,
  21     compat_urllib_parse,
  22     compat_urlparse,
  23     compat_parse_qs,
  24     compat_str,
  25     compat_etree_fromstring,
  26 )
  27 from ..utils import (
  28     NO_DEFAULT,
  29     age_restricted,
  30     bug_reports_message,
  31     clean_html,
  32     compiled_regex_type,
  33     determine_ext,
  34     error_to_compat_str,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     parse_iso8601,
  40     RegexNotFoundError,
  41     sanitize_filename,
  42     sanitized_Request,
  43     unescapeHTML,
  44     unified_strdate,
  45     url_basename,
  46     xpath_text,
  47     xpath_with_ns,
  48     determine_protocol,
  49     parse_duration,
  50     mimetype2ext,
  51 )
  52
  53
  54 class InfoExtractor(object):
  55     """Information Extractor class.
  56
  57     Information extractors are the classes that, given a URL, extract
  58     information about the video (or videos) the URL refers to. This
  59     information includes the real video URL, the video title, author and
  60     others. The information is stored in a dictionary which is then
  61     passed to the YoutubeDL. The YoutubeDL processes this
  62     information possibly downloading the video to the file system, among
  63     other possible outcomes.
  64
  65     The type field determines the type of the result.
  66     By far the most common value (and the default if _type is missing) is
  67     "video", which indicates a single video.
  68
  69     For a video, the dictionaries must include the following fields:
  70
  71     id:             Video identifier.
  72     title:          Video title, unescaped.
  73
  74     Additionally, it must contain either a formats entry or a url one:
  75
  76     formats:        A list of dictionaries for each format available, ordered
  77                     from worst to best quality.
  78
  79                     Potential fields:
  80                     * url        Mandatory. The URL of the video file
  81                     * ext        Will be calculated from URL if missing
  82                     * format     A human-readable description of the format
  83                                  ("mp4 container with h264/opus").
  84                                  Calculated from the format_id, width, height.
  85                                  and format_note fields if missing.
  86                     * format_id  A short description of the format
  87                                  ("mp4_h264_opus" or "19").
  88                                 Technically optional, but strongly recommended.
  89                     * format_note Additional info about the format
  90                                  ("3D" or "DASH video")
  91                     * width      Width of the video, if known
  92                     * height     Height of the video, if known
  93                     * resolution Textual description of width and height
  94                     * tbr        Average bitrate of audio and video in KBit/s
  95                     * abr        Average audio bitrate in KBit/s
  96                     * acodec     Name of the audio codec in use
  97                     * asr        Audio sampling rate in Hertz
  98                     * vbr        Average video bitrate in KBit/s
  99                     * fps        Frame rate
 100                     * vcodec     Name of the video codec in use
 101                     * container  Name of the container format
 102                     * filesize   The number of bytes, if known in advance
 103                     * filesize_approx  An estimate for the number of bytes
 104                     * player_url SWF Player URL (used for rtmpdump).
 105                     * protocol   The protocol that will be used for the actual
 106                                  download, lower-case.
 107                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 108                                  "m3u8", or "m3u8_native".
 109                     * preference Order number of this format. If this field is
 110                                  present and not None, the formats get sorted
 111                                  by this field, regardless of all other values.
 112                                  -1 for default (order by other properties),
 113                                  -2 or smaller for less than default.
 114                                  < -1000 to hide the format (if there is
 115                                     another one which is strictly better)
 116                     * language   Language code, e.g. "de" or "en-US".
 117                     * language_preference  Is this in the language mentioned in
 118                                  the URL?
 119                                  10 if it's what the URL is about,
 120                                  -1 for default (don't know),
 121                                  -10 otherwise, other values reserved for now.
 122                     * quality    Order number of the video quality of this
 123                                  format, irrespective of the file format.
 124                                  -1 for default (order by other properties),
 125                                  -2 or smaller for less than default.
 126                     * source_preference  Order number for this video source
 127                                   (quality takes higher priority)
 128                                  -1 for default (order by other properties),
 129                                  -2 or smaller for less than default.
 130                     * http_headers  A dictionary of additional HTTP headers
 131                                  to add to the request.
 132                     * stretched_ratio  If given and not 1, indicates that the
 133                                  video's pixels are not square.
 134                                  width : height ratio as float.
 135                     * no_resume  The server does not support resuming the
 136                                  (HTTP or RTMP) download. Boolean.
 137
 138     url:            Final video URL.
 139     ext:            Video filename extension.
 140     format:         The video format, defaults to ext (used for --get-format)
 141     player_url:     SWF Player URL (used for rtmpdump).
 142
 143     The following fields are optional:
 144
 145     alt_title:      A secondary title of the video.
 146     display_id      An alternative identifier for the video, not necessarily
 147                     unique, but available before title. Typically, id is
 148                     something like "4234987", title "Dancing naked mole rats",
 149                     and display_id "dancing-naked-mole-rats"
 150     thumbnails:     A list of dictionaries, with the following entries:
 151                         * "id" (optional, string) - Thumbnail format ID
 152                         * "url"
 153                         * "preference" (optional, int) - quality of the image
 154                         * "width" (optional, int)
 155                         * "height" (optional, int)
 156                         * "resolution" (optional, string "{width}x{height}",
 157                                         deprecated)
 158     thumbnail:      Full URL to a video thumbnail image.
 159     description:    Full video description.
 160     uploader:       Full name of the video uploader.
 161     license:        License name the video is licensed under.
 162     creator:        The main artist who created the video.
 163     release_date:   The date (YYYYMMDD) when the video was released.
 164     timestamp:      UNIX timestamp of the moment the video became available.
 165     upload_date:    Video upload date (YYYYMMDD).
 166                     If not explicitly set, calculated from timestamp.
 167     uploader_id:    Nickname or id of the video uploader.
 168     uploader_url:   Full URL to a personal webpage of the video uploader.
 169     location:       Physical location where the video was filmed.
 170     subtitles:      The available subtitles as a dictionary in the format
 171                     {language: subformats}. "subformats" is a list sorted from
 172                     lower to higher preference, each element is a dictionary
 173                     with the "ext" entry and one of:
 174                         * "data": The subtitles file contents
 175                         * "url": A URL pointing to the subtitles file
 176                     "ext" will be calculated from URL if missing
 177     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 178                     automatically generated captions
 179     duration:       Length of the video in seconds, as an integer or float.
 180     view_count:     How many users have watched the video on the platform.
 181     like_count:     Number of positive ratings of the video
 182     dislike_count:  Number of negative ratings of the video
 183     repost_count:   Number of reposts of the video
 184     average_rating: Average rating given by users, the scale used depends on the webpage
 185     comment_count:  Number of comments on the video
 186     comments:       A list of comments, each with one or more of the following
 187                     properties (all but one of text or html optional):
 188                         * "author" - human-readable name of the comment author
 189                         * "author_id" - user ID of the comment author
 190                         * "id" - Comment ID
 191                         * "html" - Comment as HTML
 192                         * "text" - Plain text of the comment
 193                         * "timestamp" - UNIX timestamp of comment
 194                         * "parent" - ID of the comment this one is replying to.
 195                                      Set to "root" to indicate that this is a
 196                                      comment to the original video.
 197     age_limit:      Age restriction for the video, as an integer (years)
 198     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 199                     should allow to get the same result again. (It will be set
 200                     by YoutubeDL if it's missing)
 201     categories:     A list of categories that the video falls in, for example
 202                     ["Sports", "Berlin"]
 203     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 204     is_live:        True, False, or None (=unknown). Whether this video is a
 205                     live stream that goes on instead of a fixed-length video.
 206     start_time:     Time in seconds where the reproduction should start, as
 207                     specified in the URL.
 208     end_time:       Time in seconds where the reproduction should end, as
 209                     specified in the URL.
 210
 211     The following fields should only be used when the video belongs to some logical
 212     chapter or section:
 213
 214     chapter:        Name or title of the chapter the video belongs to.
 215     chapter_number: Number of the chapter the video belongs to, as an integer.
 216     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 217
 218     The following fields should only be used when the video is an episode of some
 219     series or programme:
 220
 221     series:         Title of the series or programme the video episode belongs to.
 222     season:         Title of the season the video episode belongs to.
 223     season_number:  Number of the season the video episode belongs to, as an integer.
 224     season_id:      Id of the season the video episode belongs to, as a unicode string.
 225     episode:        Title of the video episode. Unlike mandatory video title field,
 226                     this field should denote the exact title of the video episode
 227                     without any kind of decoration.
 228     episode_number: Number of the video episode within a season, as an integer.
 229     episode_id:     Id of the video episode, as a unicode string.
 230
 231     Unless mentioned otherwise, the fields should be Unicode strings.
 232
 233     Unless mentioned otherwise, None is equivalent to absence of information.
 234
 235
 236     _type "playlist" indicates multiple videos.
 237     There must be a key "entries", which is a list, an iterable, or a PagedList
 238     object, each element of which is a valid dictionary by this specification.
 239
 240     Additionally, playlists can have "title", "description" and "id" attributes
 241     with the same semantics as videos (see above).
 242
 243
 244     _type "multi_video" indicates that there are multiple videos that
 245     form a single show, for example multiple acts of an opera or TV episode.
 246     It must have an entries key like a playlist and contain all the keys
 247     required for a video at the same time.
 248
 249
 250     _type "url" indicates that the video must be extracted from another
 251     location, possibly by a different extractor. Its only required key is:
 252     "url" - the next URL to extract.
 253     The key "ie_key" can be set to the class name (minus the trailing "IE",
 254     e.g. "Youtube") if the extractor class is known in advance.
 255     Additionally, the dictionary may have any properties of the resolved entity
 256     known in advance, for example "title" if the title of the referred video is
 257     known ahead of time.
 258
 259
 260     _type "url_transparent" entities have the same specification as "url", but
 261     indicate that the given additional information is more precise than the one
 262     associated with the resolved URL.
 263     This is useful when a site employs a video service that hosts the video and
 264     its technical metadata, but that video service does not embed a useful
 265     title, description etc.
 266
 267
 268     Subclasses of this one should re-define the _real_initialize() and
 269     _real_extract() methods and define a _VALID_URL regexp.
 270     Probably, they should also be added to the list of extractors.
 271
 272     Finally, the _WORKING attribute should be set to False for broken IEs
 273     in order to warn the users and skip the tests.
 274     """
 275
 276     _ready = False
 277     _downloader = None
 278     _WORKING = True
 279
 280     def __init__(self, downloader=None):
 281         """Constructor. Receives an optional downloader."""
 282         self._ready = False
 283         self.set_downloader(downloader)
 284
 285     @classmethod
 286     def suitable(cls, url):
 287         """Receives a URL and returns True if suitable for this IE."""
 288
 289         # This does not use has/getattr intentionally - we want to know whether
 290         # we have cached the regexp for *this* class, whereas getattr would also
 291         # match the superclass
 292         if '_VALID_URL_RE' not in cls.__dict__:
 293             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 294         return cls._VALID_URL_RE.match(url) is not None
 295
 296     @classmethod
 297     def _match_id(cls, url):
 298         if '_VALID_URL_RE' not in cls.__dict__:
 299             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 300         m = cls._VALID_URL_RE.match(url)
 301         assert m
 302         return m.group('id')
 303
 304     @classmethod
 305     def working(cls):
 306         """Getter method for _WORKING."""
 307         return cls._WORKING
 308
 309     def initialize(self):
 310         """Initializes an instance (authentication, etc)."""
 311         if not self._ready:
 312             self._real_initialize()
 313             self._ready = True
 314
 315     def extract(self, url):
 316         """Extracts URL information and returns it in list of dicts."""
 317         try:
 318             self.initialize()
 319             return self._real_extract(url)
 320         except ExtractorError:
 321             raise
 322         except compat_http_client.IncompleteRead as e:
 323             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 324         except (KeyError, StopIteration) as e:
 325             raise ExtractorError('An extractor error has occurred.', cause=e)
 326
 327     def set_downloader(self, downloader):
 328         """Sets the downloader for this IE."""
 329         self._downloader = downloader
 330
 331     def _real_initialize(self):
 332         """Real initialization process. Redefine in subclasses."""
 333         pass
 334
 335     def _real_extract(self, url):
 336         """Real extraction process. Redefine in subclasses."""
 337         pass
 338
 339     @classmethod
 340     def ie_key(cls):
 341         """A string for getting the InfoExtractor with get_info_extractor"""
 342         return compat_str(cls.__name__[:-2])
 343
 344     @property
 345     def IE_NAME(self):
 346         return compat_str(type(self).__name__[:-2])
 347
 348     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 349         """ Returns the response handle """
 350         if note is None:
 351             self.report_download_webpage(video_id)
 352         elif note is not False:
 353             if video_id is None:
 354                 self.to_screen('%s' % (note,))
 355             else:
 356                 self.to_screen('%s: %s' % (video_id, note))
 357         try:
 358             return self._downloader.urlopen(url_or_request)
 359         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 360             if errnote is False:
 361                 return False
 362             if errnote is None:
 363                 errnote = 'Unable to download webpage'
 364
 365             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 366             if fatal:
 367                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 368             else:
 369                 self._downloader.report_warning(errmsg)
 370                 return False
 371
 372     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 373         """ Returns a tuple (page content as string, URL handle) """
 374         # Strip hashes from the URL (#1038)
 375         if isinstance(url_or_request, (compat_str, str)):
 376             url_or_request = url_or_request.partition('#')[0]
 377
 378         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 379         if urlh is False:
 380             assert not fatal
 381             return False
 382         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 383         return (content, urlh)
 384
 385     @staticmethod
 386     def _guess_encoding_from_content(content_type, webpage_bytes):
 387         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 388         if m:
 389             encoding = m.group(1)
 390         else:
 391             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 392                           webpage_bytes[:1024])
 393             if m:
 394                 encoding = m.group(1).decode('ascii')
 395             elif webpage_bytes.startswith(b'\xff\xfe'):
 396                 encoding = 'utf-16'
 397             else:
 398                 encoding = 'utf-8'
 399
 400         return encoding
 401
 402     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 403         content_type = urlh.headers.get('Content-Type', '')
 404         webpage_bytes = urlh.read()
 405         if prefix is not None:
 406             webpage_bytes = prefix + webpage_bytes
 407         if not encoding:
 408             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 409         if self._downloader.params.get('dump_intermediate_pages', False):
 410             try:
 411                 url = url_or_request.get_full_url()
 412             except AttributeError:
 413                 url = url_or_request
 414             self.to_screen('Dumping request to ' + url)
 415             dump = base64.b64encode(webpage_bytes).decode('ascii')
 416             self._downloader.to_screen(dump)
 417         if self._downloader.params.get('write_pages', False):
 418             try:
 419                 url = url_or_request.get_full_url()
 420             except AttributeError:
 421                 url = url_or_request
 422             basen = '%s_%s' % (video_id, url)
 423             if len(basen) > 240:
 424                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 425                 basen = basen[:240 - len(h)] + h
 426             raw_filename = basen + '.dump'
 427             filename = sanitize_filename(raw_filename, restricted=True)
 428             self.to_screen('Saving request to ' + filename)
 429             # Working around MAX_PATH limitation on Windows (see
 430             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 431             if os.name == 'nt':
 432                 absfilepath = os.path.abspath(filename)
 433                 if len(absfilepath) > 259:
 434                     filename = '\\\\?\\' + absfilepath
 435             with open(filename, 'wb') as outf:
 436                 outf.write(webpage_bytes)
 437
 438         try:
 439             content = webpage_bytes.decode(encoding, 'replace')
 440         except LookupError:
 441             content = webpage_bytes.decode('utf-8', 'replace')
 442
 443         if ('<title>Access to this site is blocked</title>' in content and
 444                 'Websense' in content[:512]):
 445             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 446             blocked_iframe = self._html_search_regex(
 447                 r'<iframe src="([^"]+)"', content,
 448                 'Websense information URL', default=None)
 449             if blocked_iframe:
 450                 msg += ' Visit %s for more details' % blocked_iframe
 451             raise ExtractorError(msg, expected=True)
 452         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 453             msg = (
 454                 'Access to this webpage has been blocked by Indian censorship. '
 455                 'Use a VPN or proxy server (with --proxy) to route around it.')
 456             block_msg = self._html_search_regex(
 457                 r'</h1><p>(.*?)</p>',
 458                 content, 'block message', default=None)
 459             if block_msg:
 460                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 461             raise ExtractorError(msg, expected=True)
 462
 463         return content
 464
 465     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 466         """ Returns the data of the page as a string """
 467         success = False
 468         try_count = 0
 469         while success is False:
 470             try:
 471                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 472                 success = True
 473             except compat_http_client.IncompleteRead as e:
 474                 try_count += 1
 475                 if try_count >= tries:
 476                     raise e
 477                 self._sleep(timeout, video_id)
 478         if res is False:
 479             return res
 480         else:
 481             content, _ = res
 482             return content
 483
 484     def _download_xml(self, url_or_request, video_id,
 485                       note='Downloading XML', errnote='Unable to download XML',
 486                       transform_source=None, fatal=True, encoding=None):
 487         """Return the xml as an xml.etree.ElementTree.Element"""
 488         xml_string = self._download_webpage(
 489             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 490         if xml_string is False:
 491             return xml_string
 492         if transform_source:
 493             xml_string = transform_source(xml_string)
 494         return compat_etree_fromstring(xml_string.encode('utf-8'))
 495
 496     def _download_json(self, url_or_request, video_id,
 497                        note='Downloading JSON metadata',
 498                        errnote='Unable to download JSON metadata',
 499                        transform_source=None,
 500                        fatal=True, encoding=None):
 501         json_string = self._download_webpage(
 502             url_or_request, video_id, note, errnote, fatal=fatal,
 503             encoding=encoding)
 504         if (not fatal) and json_string is False:
 505             return None
 506         return self._parse_json(
 507             json_string, video_id, transform_source=transform_source, fatal=fatal)
 508
 509     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 510         if transform_source:
 511             json_string = transform_source(json_string)
 512         try:
 513             return json.loads(json_string)
 514         except ValueError as ve:
 515             errmsg = '%s: Failed to parse JSON ' % video_id
 516             if fatal:
 517                 raise ExtractorError(errmsg, cause=ve)
 518             else:
 519                 self.report_warning(errmsg + str(ve))
 520
 521     def update_url_params(self, url, params):
 522         parsed_url = compat_urlparse.urlparse(url)
 523         qs = compat_parse_qs(parsed_url.query)
 524         qs.update(params)
 525         return compat_urlparse.urlunparse(
 526             parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
 527
 528     def report_warning(self, msg, video_id=None):
 529         idstr = '' if video_id is None else '%s: ' % video_id
 530         self._downloader.report_warning(
 531             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 532
 533     def to_screen(self, msg):
 534         """Print msg to screen, prefixing it with '[ie_name]'"""
 535         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 536
 537     def report_extraction(self, id_or_name):
 538         """Report information extraction."""
 539         self.to_screen('%s: Extracting information' % id_or_name)
 540
 541     def report_download_webpage(self, video_id):
 542         """Report webpage download."""
 543         self.to_screen('%s: Downloading webpage' % video_id)
 544
 545     def report_age_confirmation(self):
 546         """Report attempt to confirm age."""
 547         self.to_screen('Confirming age')
 548
 549     def report_login(self):
 550         """Report attempt to log in."""
 551         self.to_screen('Logging in')
 552
 553     @staticmethod
 554     def raise_login_required(msg='This video is only available for registered users'):
 555         raise ExtractorError(
 556             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 557             expected=True)
 558
 559     @staticmethod
 560     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 561         raise ExtractorError(
 562             '%s. You might want to use --proxy to workaround.' % msg,
 563             expected=True)
 564
 565     # Methods for following #608
 566     @staticmethod
 567     def url_result(url, ie=None, video_id=None, video_title=None):
 568         """Returns a URL that points to a page that should be processed"""
 569         # TODO: ie should be the class used for getting the info
 570         video_info = {'_type': 'url',
 571                       'url': url,
 572                       'ie_key': ie}
 573         if video_id is not None:
 574             video_info['id'] = video_id
 575         if video_title is not None:
 576             video_info['title'] = video_title
 577         return video_info
 578
 579     @staticmethod
 580     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 581         """Returns a playlist"""
 582         video_info = {'_type': 'playlist',
 583                       'entries': entries}
 584         if playlist_id:
 585             video_info['id'] = playlist_id
 586         if playlist_title:
 587             video_info['title'] = playlist_title
 588         if playlist_description:
 589             video_info['description'] = playlist_description
 590         return video_info
 591
 592     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 593         """
 594         Perform a regex search on the given string, using a single or a list of
 595         patterns returning the first matching group.
 596         In case of failure return a default value or raise a WARNING or a
 597         RegexNotFoundError, depending on fatal, specifying the field name.
 598         """
 599         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 600             mobj = re.search(pattern, string, flags)
 601         else:
 602             for p in pattern:
 603                 mobj = re.search(p, string, flags)
 604                 if mobj:
 605                     break
 606
 607         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 608             _name = '\033[0;34m%s\033[0m' % name
 609         else:
 610             _name = name
 611
 612         if mobj:
 613             if group is None:
 614                 # return the first matching group
 615                 return next(g for g in mobj.groups() if g is not None)
 616             else:
 617                 return mobj.group(group)
 618         elif default is not NO_DEFAULT:
 619             return default
 620         elif fatal:
 621             raise RegexNotFoundError('Unable to extract %s' % _name)
 622         else:
 623             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 624             return None
 625
 626     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 627         """
 628         Like _search_regex, but strips HTML tags and unescapes entities.
 629         """
 630         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 631         if res:
 632             return clean_html(res).strip()
 633         else:
 634             return res
 635
 636     def _get_login_info(self):
 637         """
 638         Get the login info as (username, password)
 639         It will look in the netrc file using the _NETRC_MACHINE value
 640         If there's no info available, return (None, None)
 641         """
 642         if self._downloader is None:
 643             return (None, None)
 644
 645         username = None
 646         password = None
 647         downloader_params = self._downloader.params
 648
 649         # Attempt to use provided username and password or .netrc data
 650         if downloader_params.get('username') is not None:
 651             username = downloader_params['username']
 652             password = downloader_params['password']
 653         elif downloader_params.get('usenetrc', False):
 654             try:
 655                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 656                 if info is not None:
 657                     username = info[0]
 658                     password = info[2]
 659                 else:
 660                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 661             except (IOError, netrc.NetrcParseError) as err:
 662                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 663
 664         return (username, password)
 665
 666     def _get_tfa_info(self, note='two-factor verification code'):
 667         """
 668         Get the two-factor authentication info
 669         TODO - asking the user will be required for sms/phone verify
 670         currently just uses the command line option
 671         If there's no info available, return None
 672         """
 673         if self._downloader is None:
 674             return None
 675         downloader_params = self._downloader.params
 676
 677         if downloader_params.get('twofactor') is not None:
 678             return downloader_params['twofactor']
 679
 680         return compat_getpass('Type %s and press [Return]: ' % note)
 681
 682     # Helper functions for extracting OpenGraph info
 683     @staticmethod
 684     def _og_regexes(prop):
 685         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 686         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 687                        % {'prop': re.escape(prop)})
 688         template = r'<meta[^>]+?%s[^>]+?%s'
 689         return [
 690             template % (property_re, content_re),
 691             template % (content_re, property_re),
 692         ]
 693
 694     @staticmethod
 695     def _meta_regex(prop):
 696         return r'''(?isx)<meta
 697                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 698                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 699
 700     def _og_search_property(self, prop, html, name=None, **kargs):
 701         if name is None:
 702             name = 'OpenGraph %s' % prop
 703         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 704         if escaped is None:
 705             return None
 706         return unescapeHTML(escaped)
 707
 708     def _og_search_thumbnail(self, html, **kargs):
 709         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 710
 711     def _og_search_description(self, html, **kargs):
 712         return self._og_search_property('description', html, fatal=False, **kargs)
 713
 714     def _og_search_title(self, html, **kargs):
 715         return self._og_search_property('title', html, **kargs)
 716
 717     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 718         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 719         if secure:
 720             regexes = self._og_regexes('video:secure_url') + regexes
 721         return self._html_search_regex(regexes, html, name, **kargs)
 722
 723     def _og_search_url(self, html, **kargs):
 724         return self._og_search_property('url', html, **kargs)
 725
 726     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 727         if display_name is None:
 728             display_name = name
 729         return self._html_search_regex(
 730             self._meta_regex(name),
 731             html, display_name, fatal=fatal, group='content', **kwargs)
 732
 733     def _dc_search_uploader(self, html):
 734         return self._html_search_meta('dc.creator', html, 'uploader')
 735
 736     def _rta_search(self, html):
 737         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 738         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 739                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 740                      html):
 741             return 18
 742         return 0
 743
 744     def _media_rating_search(self, html):
 745         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 746         rating = self._html_search_meta('rating', html)
 747
 748         if not rating:
 749             return None
 750
 751         RATING_TABLE = {
 752             'safe for kids': 0,
 753             'general': 8,
 754             '14 years': 14,
 755             'mature': 17,
 756             'restricted': 19,
 757         }
 758         return RATING_TABLE.get(rating.lower())
 759
 760     def _family_friendly_search(self, html):
 761         # See http://schema.org/VideoObject
 762         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 763
 764         if not family_friendly:
 765             return None
 766
 767         RATING_TABLE = {
 768             '1': 0,
 769             'true': 0,
 770             '0': 18,
 771             'false': 18,
 772         }
 773         return RATING_TABLE.get(family_friendly.lower())
 774
 775     def _twitter_search_player(self, html):
 776         return self._html_search_meta('twitter:player', html,
 777                                       'twitter card player')
 778
 779     def _search_json_ld(self, html, video_id, **kwargs):
 780         json_ld = self._search_regex(
 781             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 782             html, 'JSON-LD', group='json_ld', **kwargs)
 783         if not json_ld:
 784             return {}
 785         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 786
 787     def _json_ld(self, json_ld, video_id, fatal=True):
 788         if isinstance(json_ld, compat_str):
 789             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 790         if not json_ld:
 791             return {}
 792         info = {}
 793         if json_ld.get('@context') == 'http://schema.org':
 794             item_type = json_ld.get('@type')
 795             if item_type == 'TVEpisode':
 796                 info.update({
 797                     'episode': unescapeHTML(json_ld.get('name')),
 798                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 799                     'description': unescapeHTML(json_ld.get('description')),
 800                 })
 801                 part_of_season = json_ld.get('partOfSeason')
 802                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 803                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 804                 part_of_series = json_ld.get('partOfSeries')
 805                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 806                     info['series'] = unescapeHTML(part_of_series.get('name'))
 807             elif item_type == 'Article':
 808                 info.update({
 809                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 810                     'title': unescapeHTML(json_ld.get('headline')),
 811                     'description': unescapeHTML(json_ld.get('articleBody')),
 812                 })
 813         return dict((k, v) for k, v in info.items() if v is not None)
 814
 815     @staticmethod
 816     def _hidden_inputs(html):
 817         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 818         hidden_inputs = {}
 819         for input in re.findall(r'(?i)<input([^>]+)>', html):
 820             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 821                 continue
 822             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 823             if not name:
 824                 continue
 825             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 826             if not value:
 827                 continue
 828             hidden_inputs[name.group('value')] = value.group('value')
 829         return hidden_inputs
 830
 831     def _form_hidden_inputs(self, form_id, html):
 832         form = self._search_regex(
 833             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 834             html, '%s form' % form_id, group='form')
 835         return self._hidden_inputs(form)
 836
 837     def _sort_formats(self, formats, field_preference=None):
 838         if not formats:
 839             raise ExtractorError('No video formats found')
 840
 841         for f in formats:
 842             # Automatically determine tbr when missing based on abr and vbr (improves
 843             # formats sorting in some cases)
 844             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 845                 f['tbr'] = f['abr'] + f['vbr']
 846
 847         def _formats_key(f):
 848             # TODO remove the following workaround
 849             from ..utils import determine_ext
 850             if not f.get('ext') and 'url' in f:
 851                 f['ext'] = determine_ext(f['url'])
 852
 853             if isinstance(field_preference, (list, tuple)):
 854                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 855
 856             preference = f.get('preference')
 857             if preference is None:
 858                 preference = 0
 859                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 860                     preference -= 0.5
 861
 862             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 863
 864             if f.get('vcodec') == 'none':  # audio only
 865                 if self._downloader.params.get('prefer_free_formats'):
 866                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 867                 else:
 868                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 869                 ext_preference = 0
 870                 try:
 871                     audio_ext_preference = ORDER.index(f['ext'])
 872                 except ValueError:
 873                     audio_ext_preference = -1
 874             else:
 875                 if self._downloader.params.get('prefer_free_formats'):
 876                     ORDER = ['flv', 'mp4', 'webm']
 877                 else:
 878                     ORDER = ['webm', 'flv', 'mp4']
 879                 try:
 880                     ext_preference = ORDER.index(f['ext'])
 881                 except ValueError:
 882                     ext_preference = -1
 883                 audio_ext_preference = 0
 884
 885             return (
 886                 preference,
 887                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 888                 f.get('quality') if f.get('quality') is not None else -1,
 889                 f.get('tbr') if f.get('tbr') is not None else -1,
 890                 f.get('filesize') if f.get('filesize') is not None else -1,
 891                 f.get('vbr') if f.get('vbr') is not None else -1,
 892                 f.get('height') if f.get('height') is not None else -1,
 893                 f.get('width') if f.get('width') is not None else -1,
 894                 proto_preference,
 895                 ext_preference,
 896                 f.get('abr') if f.get('abr') is not None else -1,
 897                 audio_ext_preference,
 898                 f.get('fps') if f.get('fps') is not None else -1,
 899                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 900                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 901                 f.get('format_id') if f.get('format_id') is not None else '',
 902             )
 903         formats.sort(key=_formats_key)
 904
 905     def _check_formats(self, formats, video_id):
 906         if formats:
 907             formats[:] = filter(
 908                 lambda f: self._is_valid_url(
 909                     f['url'], video_id,
 910                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 911                 formats)
 912
 913     @staticmethod
 914     def _remove_duplicate_formats(formats):
 915         format_urls = set()
 916         unique_formats = []
 917         for f in formats:
 918             if f['url'] not in format_urls:
 919                 format_urls.add(f['url'])
 920                 unique_formats.append(f)
 921         formats[:] = unique_formats
 922
 923     def _is_valid_url(self, url, video_id, item='video'):
 924         url = self._proto_relative_url(url, scheme='http:')
 925         # For now assume non HTTP(S) URLs always valid
 926         if not (url.startswith('http://') or url.startswith('https://')):
 927             return True
 928         try:
 929             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 930             return True
 931         except ExtractorError as e:
 932             if isinstance(e.cause, compat_urllib_error.URLError):
 933                 self.to_screen(
 934                     '%s: %s URL is invalid, skipping' % (video_id, item))
 935                 return False
 936             raise
 937
 938     def http_scheme(self):
 939         """ Either "http:" or "https:", depending on the user's preferences """
 940         return (
 941             'http:'
 942             if self._downloader.params.get('prefer_insecure', False)
 943             else 'https:')
 944
 945     def _proto_relative_url(self, url, scheme=None):
 946         if url is None:
 947             return url
 948         if url.startswith('//'):
 949             if scheme is None:
 950                 scheme = self.http_scheme()
 951             return scheme + url
 952         else:
 953             return url
 954
 955     def _sleep(self, timeout, video_id, msg_template=None):
 956         if msg_template is None:
 957             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 958         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 959         self.to_screen(msg)
 960         time.sleep(timeout)
 961
 962     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 963                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 964                              fatal=True):
 965         manifest = self._download_xml(
 966             manifest_url, video_id, 'Downloading f4m manifest',
 967             'Unable to download f4m manifest',
 968             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 969             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 970             transform_source=transform_source,
 971             fatal=fatal)
 972
 973         if manifest is False:
 974             return []
 975
 976         formats = []
 977         manifest_version = '1.0'
 978         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 979         if not media_nodes:
 980             manifest_version = '2.0'
 981             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 982         base_url = xpath_text(
 983             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 984             'base URL', default=None)
 985         if base_url:
 986             base_url = base_url.strip()
 987         for i, media_el in enumerate(media_nodes):
 988             if manifest_version == '2.0':
 989                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 990                 if not media_url:
 991                     continue
 992                 manifest_url = (
 993                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 994                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 995                 # If media_url is itself a f4m manifest do the recursive extraction
 996                 # since bitrates in parent manifest (this one) and media_url manifest
 997                 # may differ leading to inability to resolve the format by requested
 998                 # bitrate in f4m downloader
 999                 if determine_ext(manifest_url) == 'f4m':
1000                     formats.extend(self._extract_f4m_formats(
1001                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
1002                     continue
1003             tbr = int_or_none(media_el.attrib.get('bitrate'))
1004             formats.append({
1005                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1006                 'url': manifest_url,
1007                 'ext': 'flv',
1008                 'tbr': tbr,
1009                 'width': int_or_none(media_el.attrib.get('width')),
1010                 'height': int_or_none(media_el.attrib.get('height')),
1011                 'preference': preference,
1012             })
1013         self._sort_formats(formats)
1014
1015         return formats
1016
1017     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1018                               entry_protocol='m3u8', preference=None,
1019                               m3u8_id=None, note=None, errnote=None,
1020                               fatal=True):
1021
1022         formats = [{
1023             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1024             'url': m3u8_url,
1025             'ext': ext,
1026             'protocol': 'm3u8',
1027             'preference': preference - 1 if preference else -1,
1028             'resolution': 'multiple',
1029             'format_note': 'Quality selection URL',
1030         }]
1031
1032         format_url = lambda u: (
1033             u
1034             if re.match(r'^https?://', u)
1035             else compat_urlparse.urljoin(m3u8_url, u))
1036
1037         res = self._download_webpage_handle(
1038             m3u8_url, video_id,
1039             note=note or 'Downloading m3u8 information',
1040             errnote=errnote or 'Failed to download m3u8 information',
1041             fatal=fatal)
1042         if res is False:
1043             return []
1044         m3u8_doc, urlh = res
1045         m3u8_url = urlh.geturl()
1046
1047         # We should try extracting formats only from master playlists [1], i.e.
1048         # playlists that describe available qualities. On the other hand media
1049         # playlists [2] should be returned as is since they contain just the media
1050         # without qualities renditions.
1051         # Fortunately, master playlist can be easily distinguished from media
1052         # playlist based on particular tags availability. As of [1, 2] master
1053         # playlist tags MUST NOT appear in a media playist and vice versa.
1054         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1055         # and MUST NOT appear in master playlist thus we can clearly detect media
1056         # playlist with this criterion.
1057         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1058         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1059         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1060         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1061             return [{
1062                 'url': m3u8_url,
1063                 'format_id': m3u8_id,
1064                 'ext': ext,
1065                 'protocol': entry_protocol,
1066                 'preference': preference,
1067             }]
1068         last_info = None
1069         last_media = None
1070         kv_rex = re.compile(
1071             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1072         for line in m3u8_doc.splitlines():
1073             if line.startswith('#EXT-X-STREAM-INF:'):
1074                 last_info = {}
1075                 for m in kv_rex.finditer(line):
1076                     v = m.group('val')
1077                     if v.startswith('"'):
1078                         v = v[1:-1]
1079                     last_info[m.group('key')] = v
1080             elif line.startswith('#EXT-X-MEDIA:'):
1081                 last_media = {}
1082                 for m in kv_rex.finditer(line):
1083                     v = m.group('val')
1084                     if v.startswith('"'):
1085                         v = v[1:-1]
1086                     last_media[m.group('key')] = v
1087             elif line.startswith('#') or not line.strip():
1088                 continue
1089             else:
1090                 if last_info is None:
1091                     formats.append({'url': format_url(line)})
1092                     continue
1093                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1094                 format_id = []
1095                 if m3u8_id:
1096                     format_id.append(m3u8_id)
1097                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1098                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1099                 f = {
1100                     'format_id': '-'.join(format_id),
1101                     'url': format_url(line.strip()),
1102                     'tbr': tbr,
1103                     'ext': ext,
1104                     'protocol': entry_protocol,
1105                     'preference': preference,
1106                 }
1107                 resolution = last_info.get('RESOLUTION')
1108                 if resolution:
1109                     width_str, height_str = resolution.split('x')
1110                     f['width'] = int(width_str)
1111                     f['height'] = int(height_str)
1112                 codecs = last_info.get('CODECS')
1113                 if codecs:
1114                     vcodec, acodec = [None] * 2
1115                     va_codecs = codecs.split(',')
1116                     if len(va_codecs) == 1:
1117                         # Audio only entries usually come with single codec and
1118                         # no resolution. For more robustness we also check it to
1119                         # be mp4 audio.
1120                         if not resolution and va_codecs[0].startswith('mp4a'):
1121                             vcodec, acodec = 'none', va_codecs[0]
1122                         else:
1123                             vcodec = va_codecs[0]
1124                     else:
1125                         vcodec, acodec = va_codecs[:2]
1126                     f.update({
1127                         'acodec': acodec,
1128                         'vcodec': vcodec,
1129                     })
1130                 if last_media is not None:
1131                     f['m3u8_media'] = last_media
1132                     last_media = None
1133                 formats.append(f)
1134                 last_info = {}
1135         self._sort_formats(formats)
1136         return formats
1137
1138     @staticmethod
1139     def _xpath_ns(path, namespace=None):
1140         if not namespace:
1141             return path
1142         out = []
1143         for c in path.split('/'):
1144             if not c or c == '.':
1145                 out.append(c)
1146             else:
1147                 out.append('{%s}%s' % (namespace, c))
1148         return '/'.join(out)
1149
1150     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1151         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1152
1153         if smil is False:
1154             assert not fatal
1155             return []
1156
1157         namespace = self._parse_smil_namespace(smil)
1158
1159         return self._parse_smil_formats(
1160             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1161
1162     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1163         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1164         if smil is False:
1165             return {}
1166         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1167
1168     def _download_smil(self, smil_url, video_id, fatal=True):
1169         return self._download_xml(
1170             smil_url, video_id, 'Downloading SMIL file',
1171             'Unable to download SMIL file', fatal=fatal)
1172
1173     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1174         namespace = self._parse_smil_namespace(smil)
1175
1176         formats = self._parse_smil_formats(
1177             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1178         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1179
1180         video_id = os.path.splitext(url_basename(smil_url))[0]
1181         title = None
1182         description = None
1183         upload_date = None
1184         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1185             name = meta.attrib.get('name')
1186             content = meta.attrib.get('content')
1187             if not name or not content:
1188                 continue
1189             if not title and name == 'title':
1190                 title = content
1191             elif not description and name in ('description', 'abstract'):
1192                 description = content
1193             elif not upload_date and name == 'date':
1194                 upload_date = unified_strdate(content)
1195
1196         thumbnails = [{
1197             'id': image.get('type'),
1198             'url': image.get('src'),
1199             'width': int_or_none(image.get('width')),
1200             'height': int_or_none(image.get('height')),
1201         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1202
1203         return {
1204             'id': video_id,
1205             'title': title or video_id,
1206             'description': description,
1207             'upload_date': upload_date,
1208             'thumbnails': thumbnails,
1209             'formats': formats,
1210             'subtitles': subtitles,
1211         }
1212
1213     def _parse_smil_namespace(self, smil):
1214         return self._search_regex(
1215             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1216
1217     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1218         base = smil_url
1219         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1220             b = meta.get('base') or meta.get('httpBase')
1221             if b:
1222                 base = b
1223                 break
1224
1225         formats = []
1226         rtmp_count = 0
1227         http_count = 0
1228         m3u8_count = 0
1229
1230         srcs = []
1231         videos = smil.findall(self._xpath_ns('.//video', namespace))
1232         for video in videos:
1233             src = video.get('src')
1234             if not src or src in srcs:
1235                 continue
1236             srcs.append(src)
1237
1238             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1239             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1240             width = int_or_none(video.get('width'))
1241             height = int_or_none(video.get('height'))
1242             proto = video.get('proto')
1243             ext = video.get('ext')
1244             src_ext = determine_ext(src)
1245             streamer = video.get('streamer') or base
1246
1247             if proto == 'rtmp' or streamer.startswith('rtmp'):
1248                 rtmp_count += 1
1249                 formats.append({
1250                     'url': streamer,
1251                     'play_path': src,
1252                     'ext': 'flv',
1253                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1254                     'tbr': bitrate,
1255                     'filesize': filesize,
1256                     'width': width,
1257                     'height': height,
1258                 })
1259                 if transform_rtmp_url:
1260                     streamer, src = transform_rtmp_url(streamer, src)
1261                     formats[-1].update({
1262                         'url': streamer,
1263                         'play_path': src,
1264                     })
1265                 continue
1266
1267             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1268             src_url = src_url.strip()
1269
1270             if proto == 'm3u8' or src_ext == 'm3u8':
1271                 m3u8_formats = self._extract_m3u8_formats(
1272                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1273                 if len(m3u8_formats) == 1:
1274                     m3u8_count += 1
1275                     m3u8_formats[0].update({
1276                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1277                         'tbr': bitrate,
1278                         'width': width,
1279                         'height': height,
1280                     })
1281                 formats.extend(m3u8_formats)
1282                 continue
1283
1284             if src_ext == 'f4m':
1285                 f4m_url = src_url
1286                 if not f4m_params:
1287                     f4m_params = {
1288                         'hdcore': '3.2.0',
1289                         'plugin': 'flowplayer-3.2.0.1',
1290                     }
1291                 f4m_url += '&' if '?' in f4m_url else '?'
1292                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1293                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1294                 continue
1295
1296             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1297                 http_count += 1
1298                 formats.append({
1299                     'url': src_url,
1300                     'ext': ext or src_ext or 'flv',
1301                     'format_id': 'http-%d' % (bitrate or http_count),
1302                     'tbr': bitrate,
1303                     'filesize': filesize,
1304                     'width': width,
1305                     'height': height,
1306                 })
1307                 continue
1308
1309         self._sort_formats(formats)
1310
1311         return formats
1312
1313     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1314         urls = []
1315         subtitles = {}
1316         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1317             src = textstream.get('src')
1318             if not src or src in urls:
1319                 continue
1320             urls.append(src)
1321             ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1322             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1323             subtitles.setdefault(lang, []).append({
1324                 'url': src,
1325                 'ext': ext,
1326             })
1327         return subtitles
1328
1329     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1330         xspf = self._download_xml(
1331             playlist_url, playlist_id, 'Downloading xpsf playlist',
1332             'Unable to download xspf manifest', fatal=fatal)
1333         if xspf is False:
1334             return []
1335         return self._parse_xspf(xspf, playlist_id)
1336
1337     def _parse_xspf(self, playlist, playlist_id):
1338         NS_MAP = {
1339             'xspf': 'http://xspf.org/ns/0/',
1340             's1': 'http://static.streamone.nl/player/ns/0',
1341         }
1342
1343         entries = []
1344         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1345             title = xpath_text(
1346                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1347             description = xpath_text(
1348                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1349             thumbnail = xpath_text(
1350                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1351             duration = float_or_none(
1352                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1353
1354             formats = [{
1355                 'url': location.text,
1356                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1357                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1358                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1359             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1360             self._sort_formats(formats)
1361
1362             entries.append({
1363                 'id': playlist_id,
1364                 'title': title,
1365                 'description': description,
1366                 'thumbnail': thumbnail,
1367                 'duration': duration,
1368                 'formats': formats,
1369             })
1370         return entries
1371
1372     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1373         res = self._download_webpage_handle(
1374             mpd_url, video_id,
1375             note=note or 'Downloading MPD manifest',
1376             errnote=errnote or 'Failed to download MPD manifest',
1377             fatal=fatal)
1378         if res is False:
1379             return []
1380         mpd, urlh = res
1381         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1382
1383         return self._parse_mpd_formats(
1384             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1385
1386     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1387         if mpd_doc.get('type') == 'dynamic':
1388             return []
1389
1390         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1391
1392         def _add_ns(path):
1393             return self._xpath_ns(path, namespace)
1394
1395         def is_drm_protected(element):
1396             return element.find(_add_ns('ContentProtection')) is not None
1397
1398         def extract_multisegment_info(element, ms_parent_info):
1399             ms_info = ms_parent_info.copy()
1400             segment_list = element.find(_add_ns('SegmentList'))
1401             if segment_list is not None:
1402                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1403                 if segment_urls_e:
1404                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1405                 initialization = segment_list.find(_add_ns('Initialization'))
1406                 if initialization is not None:
1407                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1408             else:
1409                 segment_template = element.find(_add_ns('SegmentTemplate'))
1410                 if segment_template is not None:
1411                     start_number = segment_template.get('startNumber')
1412                     if start_number:
1413                         ms_info['start_number'] = int(start_number)
1414                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1415                     if segment_timeline is not None:
1416                         s_e = segment_timeline.findall(_add_ns('S'))
1417                         if s_e:
1418                             ms_info['total_number'] = 0
1419                             for s in s_e:
1420                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1421                     else:
1422                         timescale = segment_template.get('timescale')
1423                         if timescale:
1424                             ms_info['timescale'] = int(timescale)
1425                         segment_duration = segment_template.get('duration')
1426                         if segment_duration:
1427                             ms_info['segment_duration'] = int(segment_duration)
1428                     media_template = segment_template.get('media')
1429                     if media_template:
1430                         ms_info['media_template'] = media_template
1431                     initialization = segment_template.get('initialization')
1432                     if initialization:
1433                         ms_info['initialization_url'] = initialization
1434                     else:
1435                         initialization = segment_template.find(_add_ns('Initialization'))
1436                         if initialization is not None:
1437                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1438             return ms_info
1439
1440         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1441         formats = []
1442         for period in mpd_doc.findall(_add_ns('Period')):
1443             period_duration = parse_duration(period.get('duration')) or mpd_duration
1444             period_ms_info = extract_multisegment_info(period, {
1445                 'start_number': 1,
1446                 'timescale': 1,
1447             })
1448             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1449                 if is_drm_protected(adaptation_set):
1450                     continue
1451                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1452                 for representation in adaptation_set.findall(_add_ns('Representation')):
1453                     if is_drm_protected(representation):
1454                         continue
1455                     representation_attrib = adaptation_set.attrib.copy()
1456                     representation_attrib.update(representation.attrib)
1457                     mime_type = representation_attrib.get('mimeType')
1458                     content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1459                     if content_type == 'text':
1460                         # TODO implement WebVTT downloading
1461                         pass
1462                     elif content_type == 'video' or content_type == 'audio':
1463                         base_url = ''
1464                         for element in (representation, adaptation_set, period, mpd_doc):
1465                             base_url_e = element.find(_add_ns('BaseURL'))
1466                             if base_url_e is not None:
1467                                 base_url = base_url_e.text + base_url
1468                                 if re.match(r'^https?://', base_url):
1469                                     break
1470                         if mpd_base_url and not re.match(r'^https?://', base_url):
1471                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1472                                 mpd_base_url += '/'
1473                             base_url = mpd_base_url + base_url
1474                         representation_id = representation_attrib.get('id')
1475                         lang = representation_attrib.get('lang')
1476                         url_el = representation.find(_add_ns('BaseURL'))
1477                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1478                         f = {
1479                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1480                             'url': base_url,
1481                             'width': int_or_none(representation_attrib.get('width')),
1482                             'height': int_or_none(representation_attrib.get('height')),
1483                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1484                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1485                             'fps': int_or_none(representation_attrib.get('frameRate')),
1486                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1487                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1488                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1489                             'format_note': 'DASH %s' % content_type,
1490                             'filesize': filesize,
1491                         }
1492                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1493                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1494                             if 'total_number' not in representation_ms_info and 'segment_duration':
1495                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1496                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1497                             media_template = representation_ms_info['media_template']
1498                             media_template = media_template.replace('$RepresentationID$', representation_id)
1499                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1500                             media_template.replace('$$', '$')
1501                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1502                         if 'segment_urls' in representation_ms_info:
1503                             f.update({
1504                                 'segment_urls': representation_ms_info['segment_urls'],
1505                                 'protocol': 'http_dash_segments',
1506                             })
1507                             if 'initialization_url' in representation_ms_info:
1508                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1509                                 f.update({
1510                                     'initialization_url': initialization_url,
1511                                 })
1512                                 if not f.get('url'):
1513                                     f['url'] = initialization_url
1514                         try:
1515                             existing_format = next(
1516                                 fo for fo in formats
1517                                 if fo['format_id'] == representation_id)
1518                         except StopIteration:
1519                             full_info = formats_dict.get(representation_id, {}).copy()
1520                             full_info.update(f)
1521                             formats.append(full_info)
1522                         else:
1523                             existing_format.update(f)
1524                     else:
1525                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1526         self._sort_formats(formats)
1527         return formats
1528
1529     def _live_title(self, name):
1530         """ Generate the title for a live video """
1531         now = datetime.datetime.now()
1532         now_str = now.strftime('%Y-%m-%d %H:%M')
1533         return name + ' ' + now_str
1534
1535     def _int(self, v, name, fatal=False, **kwargs):
1536         res = int_or_none(v, **kwargs)
1537         if 'get_attr' in kwargs:
1538             print(getattr(v, kwargs['get_attr']))
1539         if res is None:
1540             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1541             if fatal:
1542                 raise ExtractorError(msg)
1543             else:
1544                 self._downloader.report_warning(msg)
1545         return res
1546
1547     def _float(self, v, name, fatal=False, **kwargs):
1548         res = float_or_none(v, **kwargs)
1549         if res is None:
1550             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1551             if fatal:
1552                 raise ExtractorError(msg)
1553             else:
1554                 self._downloader.report_warning(msg)
1555         return res
1556
1557     def _set_cookie(self, domain, name, value, expire_time=None):
1558         cookie = compat_cookiejar.Cookie(
1559             0, name, value, None, None, domain, None,
1560             None, '/', True, False, expire_time, '', None, None, None)
1561         self._downloader.cookiejar.set_cookie(cookie)
1562
1563     def _get_cookies(self, url):
1564         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1565         req = sanitized_Request(url)
1566         self._downloader.cookiejar.add_cookie_header(req)
1567         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1568
1569     def get_testcases(self, include_onlymatching=False):
1570         t = getattr(self, '_TEST', None)
1571         if t:
1572             assert not hasattr(self, '_TESTS'), \
1573                 '%s has _TEST and _TESTS' % type(self).__name__
1574             tests = [t]
1575         else:
1576             tests = getattr(self, '_TESTS', [])
1577         for t in tests:
1578             if not include_onlymatching and t.get('only_matching', False):
1579                 continue
1580             t['name'] = type(self).__name__[:-len('IE')]
1581             yield t
1582
1583     def is_suitable(self, age_limit):
1584         """ Test whether the extractor is generally suitable for the given
1585         age limit (i.e. pornographic sites are not, all others usually are) """
1586
1587         any_restricted = False
1588         for tc in self.get_testcases(include_onlymatching=False):
1589             if 'playlist' in tc:
1590                 tc = tc['playlist'][0]
1591             is_restricted = age_restricted(
1592                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1593             if not is_restricted:
1594                 return True
1595             any_restricted = any_restricted or is_restricted
1596         return not any_restricted
1597
1598     def extract_subtitles(self, *args, **kwargs):
1599         if (self._downloader.params.get('writesubtitles', False) or
1600                 self._downloader.params.get('listsubtitles')):
1601             return self._get_subtitles(*args, **kwargs)
1602         return {}
1603
1604     def _get_subtitles(self, *args, **kwargs):
1605         raise NotImplementedError('This method must be implemented by subclasses')
1606
1607     @staticmethod
1608     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1609         """ Merge subtitle items for one language. Items with duplicated URLs
1610         will be dropped. """
1611         list1_urls = set([item['url'] for item in subtitle_list1])
1612         ret = list(subtitle_list1)
1613         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1614         return ret
1615
1616     @classmethod
1617     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1618         """ Merge two subtitle dictionaries, language by language. """
1619         ret = dict(subtitle_dict1)
1620         for lang in subtitle_dict2:
1621             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1622         return ret
1623
1624     def extract_automatic_captions(self, *args, **kwargs):
1625         if (self._downloader.params.get('writeautomaticsub', False) or
1626                 self._downloader.params.get('listsubtitles')):
1627             return self._get_automatic_captions(*args, **kwargs)
1628         return {}
1629
1630     def _get_automatic_captions(self, *args, **kwargs):
1631         raise NotImplementedError('This method must be implemented by subclasses')
1632
1633     def mark_watched(self, *args, **kwargs):
1634         if (self._downloader.params.get('mark_watched', False) and
1635                 (self._get_login_info()[0] is not None or
1636                     self._downloader.params.get('cookiefile') is not None)):
1637             self._mark_watched(*args, **kwargs)
1638
1639     def _mark_watched(self, *args, **kwargs):
1640         raise NotImplementedError('This method must be implemented by subclasses')
1641
1642
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept queries made of _SEARCH_KEY, an optional prefix (a positive
    number or "all"), a colon and the search terms.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching queries for this search key."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the query and fetch the requested number of results.

        An empty prefix requests one result, "all" requests _MAX_RESULTS and
        a number requests that many, capped at _MAX_RESULTS with a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            count = 1
        elif prefix == 'all':
            count = self._MAX_RESULTS
        else:
            count = int(prefix)
            if count <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (count, query))
            if count > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
                count = self._MAX_RESULTS
        return self._get_n_results(query, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key of this extractor (read-only view of _SEARCH_KEY)."""
        return self._SEARCH_KEY