1 from __future__ import unicode_literals
15 from ..compat import (
24 compat_etree_fromstring,
53 class InfoExtractor(object):
54 """Information Extractor class.
56 Information extractors are the classes that, given a URL, extract
57 information about the video (or videos) the URL refers to. This
58 information includes the real video URL, the video title, author and
59 others. The information is stored in a dictionary which is then
60 passed to the YoutubeDL. The YoutubeDL processes this
61 information, possibly downloading the video to the file system, among
62 other possible outcomes.
64 The type field determines the type of the result.
65 By far the most common value (and the default if _type is missing) is
66 "video", which indicates a single video.
68 For a video, the dictionaries must include the following fields:
71 title: Video title, unescaped.
73 Additionally, it must contain either a formats entry or a url one:
75 formats: A list of dictionaries for each format available, ordered
76 from worst to best quality.
79 * url Mandatory. The URL of the video file
80 * ext Will be calculated from URL if missing
81 * format A human-readable description of the format
82 ("mp4 container with h264/opus").
83 Calculated from the format_id, width, height,
84 and format_note fields if missing.
85 * format_id A short description of the format
86 ("mp4_h264_opus" or "19").
87 Technically optional, but strongly recommended.
88 * format_note Additional info about the format
89 ("3D" or "DASH video")
90 * width Width of the video, if known
91 * height Height of the video, if known
92 * resolution Textual description of width and height
93 * tbr Average bitrate of audio and video in KBit/s
94 * abr Average audio bitrate in KBit/s
95 * acodec Name of the audio codec in use
96 * asr Audio sampling rate in Hertz
97 * vbr Average video bitrate in KBit/s
99 * vcodec Name of the video codec in use
100 * container Name of the container format
101 * filesize The number of bytes, if known in advance
102 * filesize_approx An estimate for the number of bytes
103 * player_url SWF Player URL (used for rtmpdump).
104 * protocol The protocol that will be used for the actual
105 download, lower-case.
106 "http", "https", "rtsp", "rtmp", "rtmpe",
107 "m3u8", or "m3u8_native".
108 * preference Order number of this format. If this field is
109 present and not None, the formats get sorted
110 by this field, regardless of all other values.
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 < -1000 to hide the format (if there is
114 another one which is strictly better)
115 * language Language code, e.g. "de" or "en-US".
116 * language_preference Is this in the language mentioned in the URL?
118 10 if it's what the URL is about,
119 -1 for default (don't know),
120 -10 otherwise, other values reserved for now.
121 * quality Order number of the video quality of this
122 format, irrespective of the file format.
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
125 * source_preference Order number for this video source
126 (quality takes higher priority)
127 -1 for default (order by other properties),
128 -2 or smaller for less than default.
129 * http_headers A dictionary of additional HTTP headers
130 to add to the request.
131 * stretched_ratio If given and not 1, indicates that the
132 video's pixels are not square.
133 width : height ratio as float.
134 * no_resume The server does not support resuming the
135 (HTTP or RTMP) download. Boolean.
137 url: Final video URL.
138 ext: Video filename extension.
139 format: The video format, defaults to ext (used for --get-format)
140 player_url: SWF Player URL (used for rtmpdump).
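
For illustration, a hypothetical minimal result for a single video with one
format could look like this (all values made up):

    {
        'id': '4234987',
        'title': 'Dancing naked mole rats',
        'formats': [{
            'url': 'https://example.com/videos/4234987.mp4',
            'ext': 'mp4',
            'format_id': 'mp4-720p',
            'width': 1280,
            'height': 720,
        }],
    }
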
142 The following fields are optional:
144 alt_title: A secondary title of the video.
145 display_id: An alternative identifier for the video, not necessarily
146 unique, but available before title. Typically, id is
147 something like "4234987", title "Dancing naked mole rats",
148 and display_id "dancing-naked-mole-rats"
149 thumbnails: A list of dictionaries, with the following entries:
150 * "id" (optional, string) - Thumbnail format ID
152 * "preference" (optional, int) - quality of the image
153 * "width" (optional, int)
154 * "height" (optional, int)
155 * "resolution" (optional, string "{width}x{height"},
157 thumbnail: Full URL to a video thumbnail image.
158 description: Full video description.
159 uploader: Full name of the video uploader.
160 license: License name the video is licensed under.
161 creator: The main artist who created the video.
162 release_date: The date (YYYYMMDD) when the video was released.
163 timestamp: UNIX timestamp of the moment the video became available.
164 upload_date: Video upload date (YYYYMMDD).
165 If not explicitly set, calculated from timestamp.
166 uploader_id: Nickname or id of the video uploader.
167 uploader_url: Full URL to a personal webpage of the video uploader.
168 location: Physical location where the video was filmed.
169 subtitles: The available subtitles as a dictionary in the format
170 {language: subformats}. "subformats" is a list sorted from
171 lower to higher preference, each element is a dictionary
172 with the "ext" entry and one of:
173 * "data": The subtitles file contents
174 * "url": A URL pointing to the subtitles file
175 "ext" will be calculated from URL if missing
176 automatic_captions: Like 'subtitles', used by the YoutubeIE for
177 automatically generated captions
178 duration: Length of the video in seconds, as an integer or float.
179 view_count: How many users have watched the video on the platform.
180 like_count: Number of positive ratings of the video
181 dislike_count: Number of negative ratings of the video
182 repost_count: Number of reposts of the video
183 average_rating: Average rating given by users, the scale used depends on the webpage
184 comment_count: Number of comments on the video
185 comments: A list of comments, each with one or more of the following
186 properties (all optional, but at least one of "text" or "html" is required):
187 * "author" - human-readable name of the comment author
188 * "author_id" - user ID of the comment author
190 * "html" - Comment as HTML
191 * "text" - Plain text of the comment
192 * "timestamp" - UNIX timestamp of comment
193 * "parent" - ID of the comment this one is replying to.
194 Set to "root" to indicate that this is a
195 comment to the original video.
196 age_limit: Age restriction for the video, as an integer (years)
197 webpage_url: The URL to the video webpage, if given to youtube-dl it
198 should allow getting the same result again. (It will be set
199 by YoutubeDL if it's missing)
200 categories: A list of categories that the video falls in, for example ["Sports", "Berlin"]
202 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
203 is_live: True, False, or None (=unknown). Whether this video is a
204 live stream that goes on instead of a fixed-length video.
205 start_time: Time in seconds where the reproduction should start, as
206 specified in the URL.
207 end_time: Time in seconds where the reproduction should end, as
208 specified in the URL.
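
To illustrate a few of the optional fields above, a hypothetical result could
additionally carry (values made up):

    {
        # id, title and formats as above, plus:
        'thumbnails': [{
            'url': 'https://example.com/thumbs/4234987_hd.jpg',
            'width': 1920,
            'height': 1080,
        }],
        'subtitles': {
            'en': [{
                'ext': 'vtt',
                'url': 'https://example.com/subs/4234987.en.vtt',
            }],
        },
        'duration': 1320.5,
    }
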
210 The following fields should only be used when the video belongs to some logical chapter or section:
213 chapter: Name or title of the chapter the video belongs to.
214 chapter_number: Number of the chapter the video belongs to, as an integer.
215 chapter_id: Id of the chapter the video belongs to, as a unicode string.
217 The following fields should only be used when the video is an episode of some series or programme:
220 series: Title of the series or programme the video episode belongs to.
221 season: Title of the season the video episode belongs to.
222 season_number: Number of the season the video episode belongs to, as an integer.
223 season_id: Id of the season the video episode belongs to, as a unicode string.
224 episode: Title of the video episode. Unlike mandatory video title field,
225 this field should denote the exact title of the video episode
226 without any kind of decoration.
227 episode_number: Number of the video episode within a season, as an integer.
228 episode_id: Id of the video episode, as a unicode string.
230 Unless mentioned otherwise, the fields should be Unicode strings.
232 Unless mentioned otherwise, None is equivalent to absence of information.
235 _type "playlist" indicates multiple videos.
236 There must be a key "entries", which is a list, an iterable, or a PagedList
237 object, each element of which is a valid dictionary by this specification.
239 Additionally, playlists can have "title", "description" and "id" attributes
240 with the same semantics as videos (see above).
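
A hypothetical playlist result could look like (values made up):

    {
        '_type': 'playlist',
        'id': 'PL4234987',
        'title': 'Naked mole rat documentaries',
        'entries': [
            {'_type': 'url', 'url': 'https://example.com/watch/1', 'ie_key': 'Example'},
            {'_type': 'url', 'url': 'https://example.com/watch/2', 'ie_key': 'Example'},
        ],
    }
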
243 _type "multi_video" indicates that there are multiple videos that
244 form a single show, for example multiple acts of an opera or TV episode.
245 It must have an entries key like a playlist and contain all the keys
246 required for a video at the same time.
249 _type "url" indicates that the video must be extracted from another
250 location, possibly by a different extractor. Its only required key is:
251 "url" - the next URL to extract.
252 The key "ie_key" can be set to the class name (minus the trailing "IE",
253 e.g. "Youtube") if the extractor class is known in advance.
254 Additionally, the dictionary may have any properties of the resolved entity
255 known in advance, for example "title" if the title of the referred video is known ahead of time.
259 _type "url_transparent" entities have the same specification as "url", but
260 indicate that the given additional information is more precise than the one
261 associated with the resolved URL.
262 This is useful when a site employs a video service that hosts the video and
263 its technical metadata, but that video service does not embed a useful
264 title, description etc.
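
For example (hypothetical values), an extractor can hand off to another
extractor:

    {
        '_type': 'url',
        'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
        'ie_key': 'Youtube',
    }

or, with "url_transparent", additionally override metadata of the resolved
entity:

    {
        '_type': 'url_transparent',
        'url': 'https://videoservice.example.com/embed/1234',
        'title': 'Title taken from the embedding page',
    }
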
267 Subclasses of this one should re-define the _real_initialize() and
268 _real_extract() methods and define a _VALID_URL regexp.
269 Probably, they should also be added to the list of extractors.
271 Finally, the _WORKING attribute should be set to False for broken IEs
272 in order to warn the users and skip the tests.
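
As a rough sketch (not a real extractor; the name, URL pattern and page
layout are made up), a typical subclass looks like:

    class ExampleIE(InfoExtractor):
        _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            return {
                'id': video_id,
                'title': self._og_search_title(webpage),
                'url': self._og_search_video_url(webpage),
            }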
279 def __init__(self, downloader=None):
280 """Constructor. Receives an optional downloader."""
282 self.set_downloader(downloader)
285 def suitable(cls, url):
286 """Receives a URL and returns True if suitable for this IE."""
288 # This does not use has/getattr intentionally - we want to know whether
289 # we have cached the regexp for *this* class, whereas getattr would also
290 # match the superclass
291 if '_VALID_URL_RE' not in cls.__dict__:
292 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
293 return cls._VALID_URL_RE.match(url) is not None
296 def _match_id(cls, url):
297 if '_VALID_URL_RE' not in cls.__dict__:
298 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
299 m = cls._VALID_URL_RE.match(url)
305 """Getter method for _WORKING."""
308 def initialize(self):
309 """Initializes an instance (authentication, etc)."""
311 self._real_initialize()
314 def extract(self, url):
315 """Extracts URL information and returns it in list of dicts."""
318 return self._real_extract(url)
319 except ExtractorError:
321 except compat_http_client.IncompleteRead as e:
322 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
323 except (KeyError, StopIteration) as e:
324 raise ExtractorError('An extractor error has occurred.', cause=e)
326 def set_downloader(self, downloader):
327 """Sets the downloader for this IE."""
328 self._downloader = downloader
330 def _real_initialize(self):
331 """Real initialization process. Redefine in subclasses."""
334 def _real_extract(self, url):
335 """Real extraction process. Redefine in subclasses."""
340 """A string for getting the InfoExtractor with get_info_extractor"""
341 return compat_str(cls.__name__[:-2])
345 return compat_str(type(self).__name__[:-2])
347 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
348 """ Returns the response handle """
350 self.report_download_webpage(video_id)
351 elif note is not False:
353 self.to_screen('%s' % (note,))
355 self.to_screen('%s: %s' % (video_id, note))
357 return self._downloader.urlopen(url_or_request)
358 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
362 errnote = 'Unable to download webpage'
364 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
366 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
368 self._downloader.report_warning(errmsg)
371 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
372 """ Returns a tuple (page content as string, URL handle) """
373 # Strip hashes from the URL (#1038)
374 if isinstance(url_or_request, (compat_str, str)):
375 url_or_request = url_or_request.partition('#')[0]
377 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
381 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
382 return (content, urlh)
385 def _guess_encoding_from_content(content_type, webpage_bytes):
386 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
388 encoding = m.group(1)
390 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
391 webpage_bytes[:1024])
393 encoding = m.group(1).decode('ascii')
394 elif webpage_bytes.startswith(b'\xff\xfe'):
401 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
402 content_type = urlh.headers.get('Content-Type', '')
403 webpage_bytes = urlh.read()
404 if prefix is not None:
405 webpage_bytes = prefix + webpage_bytes
407 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
408 if self._downloader.params.get('dump_intermediate_pages', False):
410 url = url_or_request.get_full_url()
411 except AttributeError:
413 self.to_screen('Dumping request to ' + url)
414 dump = base64.b64encode(webpage_bytes).decode('ascii')
415 self._downloader.to_screen(dump)
416 if self._downloader.params.get('write_pages', False):
418 url = url_or_request.get_full_url()
419 except AttributeError:
421 basen = '%s_%s' % (video_id, url)
423 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
424 basen = basen[:240 - len(h)] + h
425 raw_filename = basen + '.dump'
426 filename = sanitize_filename(raw_filename, restricted=True)
427 self.to_screen('Saving request to ' + filename)
428 # Working around MAX_PATH limitation on Windows (see
429 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
431 absfilepath = os.path.abspath(filename)
432 if len(absfilepath) > 259:
433 filename = '\\\\?\\' + absfilepath
434 with open(filename, 'wb') as outf:
435 outf.write(webpage_bytes)
438 content = webpage_bytes.decode(encoding, 'replace')
440 content = webpage_bytes.decode('utf-8', 'replace')
442 if ('<title>Access to this site is blocked</title>' in content and
443 'Websense' in content[:512]):
444 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
445 blocked_iframe = self._html_search_regex(
446 r'<iframe src="([^"]+)"', content,
447 'Websense information URL', default=None)
449 msg += ' Visit %s for more details' % blocked_iframe
450 raise ExtractorError(msg, expected=True)
451 if '<title>The URL you requested has been blocked</title>' in content[:512]:
453 'Access to this webpage has been blocked by Indian censorship. '
454 'Use a VPN or proxy server (with --proxy) to route around it.')
455 block_msg = self._html_search_regex(
456 r'</h1><p>(.*?)</p>',
457 content, 'block message', default=None)
459 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
460 raise ExtractorError(msg, expected=True)
464 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
465 """ Returns the data of the page as a string """
468 while success is False:
470 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
472 except compat_http_client.IncompleteRead as e:
474 if try_count >= tries:
476 self._sleep(timeout, video_id)
483 def _download_xml(self, url_or_request, video_id,
484 note='Downloading XML', errnote='Unable to download XML',
485 transform_source=None, fatal=True, encoding=None):
486 """Return the xml as an xml.etree.ElementTree.Element"""
487 xml_string = self._download_webpage(
488 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
489 if xml_string is False:
492 xml_string = transform_source(xml_string)
493 return compat_etree_fromstring(xml_string.encode('utf-8'))
495 def _download_json(self, url_or_request, video_id,
496 note='Downloading JSON metadata',
497 errnote='Unable to download JSON metadata',
498 transform_source=None,
499 fatal=True, encoding=None):
500 json_string = self._download_webpage(
501 url_or_request, video_id, note, errnote, fatal=fatal,
503 if (not fatal) and json_string is False:
505 return self._parse_json(
506 json_string, video_id, transform_source=transform_source, fatal=fatal)
508 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
510 json_string = transform_source(json_string)
512 return json.loads(json_string)
513 except ValueError as ve:
514 errmsg = '%s: Failed to parse JSON ' % video_id
516 raise ExtractorError(errmsg, cause=ve)
518 self.report_warning(errmsg + str(ve))
520 def update_url_params(self, url, params):
521 parsed_url = compat_urlparse.urlparse(url)
522 qs = compat_urlparse.parse_qs(parsed_url.query)
524 return compat_urlparse.urlunparse(
525 parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
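# A hypothetical usage sketch (URL and parameters are made up; note that the
# existing query string may be re-encoded in a different order):
#   self.update_url_params('http://example.com/api?page=1', {'count': '20'})
#   # -> 'http://example.com/api?page=1&count=20'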
527 def report_warning(self, msg, video_id=None):
528 idstr = '' if video_id is None else '%s: ' % video_id
529 self._downloader.report_warning(
530 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
532 def to_screen(self, msg):
533 """Print msg to screen, prefixing it with '[ie_name]'"""
534 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
536 def report_extraction(self, id_or_name):
537 """Report information extraction."""
538 self.to_screen('%s: Extracting information' % id_or_name)
540 def report_download_webpage(self, video_id):
541 """Report webpage download."""
542 self.to_screen('%s: Downloading webpage' % video_id)
544 def report_age_confirmation(self):
545 """Report attempt to confirm age."""
546 self.to_screen('Confirming age')
548 def report_login(self):
549 """Report attempt to log in."""
550 self.to_screen('Logging in')
553 def raise_login_required(msg='This video is only available for registered users'):
554 raise ExtractorError(
555 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
559 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
560 raise ExtractorError(
561 '%s. You might want to use --proxy to workaround.' % msg,
564 # Methods for following #608
566 def url_result(url, ie=None, video_id=None, video_title=None):
567 """Returns a URL that points to a page that should be processed"""
568 # TODO: ie should be the class used for getting the info
569 video_info = {'_type': 'url',
572 if video_id is not None:
573 video_info['id'] = video_id
574 if video_title is not None:
575 video_info['title'] = video_title
579 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
580 """Returns a playlist"""
581 video_info = {'_type': 'playlist',
584 video_info['id'] = playlist_id
586 video_info['title'] = playlist_title
587 if playlist_description:
588 video_info['description'] = playlist_description
591 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
593 Perform a regex search on the given string, using a single pattern or a
594 list of patterns, and return the first matching group.
595 In case of failure, return a default value, emit a warning or raise a
596 RegexNotFoundError (depending on default and fatal), using the field name in the message.
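
A hypothetical call (pattern and page content are made up):

    duration = self._search_regex(
        r'<span id="duration">(\d+)</span>', webpage, 'duration',
        default=None)

returns '42' when the page contains '<span id="duration">42</span>' and
None (without a warning) when nothing matches.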
598 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
599 mobj = re.search(pattern, string, flags)
602 mobj = re.search(p, string, flags)
606 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
607 _name = '\033[0;34m%s\033[0m' % name
613 # return the first matching group
614 return next(g for g in mobj.groups() if g is not None)
616 return mobj.group(group)
617 elif default is not NO_DEFAULT:
620 raise RegexNotFoundError('Unable to extract %s' % _name)
622 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
625 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
627 Like _search_regex, but strips HTML tags and unescapes entities.
629 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
631 return clean_html(res).strip()
635 def _get_login_info(self):
637 Get the login info as (username, password)
638 It will look in the netrc file using the _NETRC_MACHINE value
639 If there's no info available, return (None, None)
641 if self._downloader is None:
646 downloader_params = self._downloader.params
648 # Attempt to use provided username and password or .netrc data
649 if downloader_params.get('username') is not None:
650 username = downloader_params['username']
651 password = downloader_params['password']
652 elif downloader_params.get('usenetrc', False):
654 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
659 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
660 except (IOError, netrc.NetrcParseError) as err:
661 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
663 return (username, password)
665 def _get_tfa_info(self, note='two-factor verification code'):
667 Get the two-factor authentication info
668 TODO - asking the user will be required for sms/phone verify
669 currently just uses the command line option
670 If there's no info available, return None
672 if self._downloader is None:
674 downloader_params = self._downloader.params
676 if downloader_params.get('twofactor') is not None:
677 return downloader_params['twofactor']
679 return compat_getpass('Type %s and press [Return]: ' % note)
681 # Helper functions for extracting OpenGraph info
683 def _og_regexes(prop):
684 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
685 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
686 % {'prop': re.escape(prop)})
687 template = r'<meta[^>]+?%s[^>]+?%s'
689 template % (property_re, content_re),
690 template % (content_re, property_re),
694 def _meta_regex(prop):
695 return r'''(?isx)<meta
696 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
697 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
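# For illustration (hypothetical markup): _og_regexes('title') produces patterns
# matching tags such as
#   <meta property="og:title" content="Some title">
# in either attribute order, while _meta_regex('description') matches tags such as
#   <meta name="description" content="Some description">
# exposing the content attribute as the named group "content".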
699 def _og_search_property(self, prop, html, name=None, **kargs):
701 name = 'OpenGraph %s' % prop
702 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
705 return unescapeHTML(escaped)
707 def _og_search_thumbnail(self, html, **kargs):
708 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
710 def _og_search_description(self, html, **kargs):
711 return self._og_search_property('description', html, fatal=False, **kargs)
713 def _og_search_title(self, html, **kargs):
714 return self._og_search_property('title', html, **kargs)
716 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
717 regexes = self._og_regexes('video') + self._og_regexes('video:url')
719 regexes = self._og_regexes('video:secure_url') + regexes
720 return self._html_search_regex(regexes, html, name, **kargs)
722 def _og_search_url(self, html, **kargs):
723 return self._og_search_property('url', html, **kargs)
725 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
726 if display_name is None:
728 return self._html_search_regex(
729 self._meta_regex(name),
730 html, display_name, fatal=fatal, group='content', **kwargs)
732 def _dc_search_uploader(self, html):
733 return self._html_search_meta('dc.creator', html, 'uploader')
735 def _rta_search(self, html):
736 # See http://www.rtalabel.org/index.php?content=howtofaq#single
737 if re.search(r'(?ix)<meta\s+name="rating"\s+'
738 r' content="RTA-5042-1996-1400-1577-RTA"',
743 def _media_rating_search(self, html):
744 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
745 rating = self._html_search_meta('rating', html)
757 return RATING_TABLE.get(rating.lower())
759 def _family_friendly_search(self, html):
760 # See http://schema.org/VideoObject
761 family_friendly = self._html_search_meta('isFamilyFriendly', html)
763 if not family_friendly:
772 return RATING_TABLE.get(family_friendly.lower())
774 def _twitter_search_player(self, html):
775 return self._html_search_meta('twitter:player', html,
776 'twitter card player')
778 def _search_json_ld(self, html, video_id, **kwargs):
779 json_ld = self._search_regex(
780 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
781 html, 'JSON-LD', group='json_ld', **kwargs)
784 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
786 def _json_ld(self, json_ld, video_id, fatal=True):
787 if isinstance(json_ld, compat_str):
788 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
792 if json_ld.get('@context') == 'http://schema.org':
793 item_type = json_ld.get('@type')
794 if item_type == 'TVEpisode':
796 'episode': unescapeHTML(json_ld.get('name')),
797 'episode_number': int_or_none(json_ld.get('episodeNumber')),
798 'description': unescapeHTML(json_ld.get('description')),
800 part_of_season = json_ld.get('partOfSeason')
801 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
802 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
803 part_of_series = json_ld.get('partOfSeries')
804 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
805 info['series'] = unescapeHTML(part_of_series.get('name'))
806 elif item_type == 'Article':
808 'timestamp': parse_iso8601(json_ld.get('datePublished')),
809 'title': unescapeHTML(json_ld.get('headline')),
810 'description': unescapeHTML(json_ld.get('articleBody')),
812 return dict((k, v) for k, v in info.items() if v is not None)
815 def _hidden_inputs(html):
816 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
818 for input in re.findall(r'(?i)<input([^>]+)>', html):
819 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
821 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
824 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
827 hidden_inputs[name.group('value')] = value.group('value')
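# As a rough illustration (hypothetical markup), given
#   <input type="hidden" name="csrf_token" value="abc123">
# _hidden_inputs() returns {'csrf_token': 'abc123'}; non-hidden inputs and
# inputs without a name or value attribute are skipped.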
830 def _form_hidden_inputs(self, form_id, html):
831 form = self._search_regex(
832 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
833 html, '%s form' % form_id, group='form')
834 return self._hidden_inputs(form)
836 def _sort_formats(self, formats, field_preference=None):
838 raise ExtractorError('No video formats found')
841 # Automatically determine tbr when missing based on abr and vbr (improves
842 # formats sorting in some cases)
843 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
844 f['tbr'] = f['abr'] + f['vbr']
847 # TODO remove the following workaround
848 from ..utils import determine_ext
849 if not f.get('ext') and 'url' in f:
850 f['ext'] = determine_ext(f['url'])
852 if isinstance(field_preference, (list, tuple)):
853 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
855 preference = f.get('preference')
856 if preference is None:
858 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
861 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
863 if f.get('vcodec') == 'none': # audio only
864 if self._downloader.params.get('prefer_free_formats'):
865 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
867 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
870 audio_ext_preference = ORDER.index(f['ext'])
872 audio_ext_preference = -1
874 if self._downloader.params.get('prefer_free_formats'):
875 ORDER = ['flv', 'mp4', 'webm']
877 ORDER = ['webm', 'flv', 'mp4']
879 ext_preference = ORDER.index(f['ext'])
882 audio_ext_preference = 0
886 f.get('language_preference') if f.get('language_preference') is not None else -1,
887 f.get('quality') if f.get('quality') is not None else -1,
888 f.get('tbr') if f.get('tbr') is not None else -1,
889 f.get('filesize') if f.get('filesize') is not None else -1,
890 f.get('vbr') if f.get('vbr') is not None else -1,
891 f.get('height') if f.get('height') is not None else -1,
892 f.get('width') if f.get('width') is not None else -1,
895 f.get('abr') if f.get('abr') is not None else -1,
896 audio_ext_preference,
897 f.get('fps') if f.get('fps') is not None else -1,
898 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
899 f.get('source_preference') if f.get('source_preference') is not None else -1,
900 f.get('format_id') if f.get('format_id') is not None else '',
902 formats.sort(key=_formats_key)
904 def _check_formats(self, formats, video_id):
907 lambda f: self._is_valid_url(
909 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
913 def _remove_duplicate_formats(formats):
917 if f['url'] not in format_urls:
918 format_urls.add(f['url'])
919 unique_formats.append(f)
920 formats[:] = unique_formats
922 def _is_valid_url(self, url, video_id, item='video'):
923 url = self._proto_relative_url(url, scheme='http:')
924 # For now assume non HTTP(S) URLs always valid
925 if not (url.startswith('http://') or url.startswith('https://')):
928 self._request_webpage(url, video_id, 'Checking %s URL' % item)
930 except ExtractorError as e:
931 if isinstance(e.cause, compat_urllib_error.URLError):
933 '%s: %s URL is invalid, skipping' % (video_id, item))
937 def http_scheme(self):
938 """ Either "http:" or "https:", depending on the user's preferences """
941 if self._downloader.params.get('prefer_insecure', False)
944 def _proto_relative_url(self, url, scheme=None):
947 if url.startswith('//'):
949 scheme = self.http_scheme()
954 def _sleep(self, timeout, video_id, msg_template=None):
955 if msg_template is None:
956 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
957 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
961 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
962 transform_source=lambda s: fix_xml_ampersands(s).strip(),
964 manifest = self._download_xml(
965 manifest_url, video_id, 'Downloading f4m manifest',
966 'Unable to download f4m manifest',
967 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
968 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
969 transform_source=transform_source,
972 if manifest is False:
976 manifest_version = '1.0'
977 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
979 manifest_version = '2.0'
980 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
981 base_url = xpath_text(
982 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
983 'base URL', default=None)
985 base_url = base_url.strip()
986 for i, media_el in enumerate(media_nodes):
987 if manifest_version == '2.0':
988 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
992 media_url if media_url.startswith('http://') or media_url.startswith('https://')
993 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
994 # If media_url is itself a f4m manifest do the recursive extraction
995 # since bitrates in parent manifest (this one) and media_url manifest
996 # may differ leading to inability to resolve the format by requested
997 # bitrate in f4m downloader
998 if determine_ext(manifest_url) == 'f4m':
999 formats.extend(self._extract_f4m_formats(
1000 manifest_url, video_id, preference, f4m_id, fatal=fatal))
1002 tbr = int_or_none(media_el.attrib.get('bitrate'))
1004 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1005 'url': manifest_url,
1008 'width': int_or_none(media_el.attrib.get('width')),
1009 'height': int_or_none(media_el.attrib.get('height')),
1010 'preference': preference,
1012 self._sort_formats(formats)
1016 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1017 entry_protocol='m3u8', preference=None,
1018 m3u8_id=None, note=None, errnote=None,
1022 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1026 'preference': preference - 1 if preference else -1,
1027 'resolution': 'multiple',
1028 'format_note': 'Quality selection URL',
1031 format_url = lambda u: (
1033 if re.match(r'^https?://', u)
1034 else compat_urlparse.urljoin(m3u8_url, u))
1036 res = self._download_webpage_handle(
1038 note=note or 'Downloading m3u8 information',
1039 errnote=errnote or 'Failed to download m3u8 information',
1043 m3u8_doc, urlh = res
1044 m3u8_url = urlh.geturl()
1046 # We should try extracting formats only from master playlists [1], i.e.
1047 # playlists that describe available qualities. On the other hand media
1048 # playlists [2] should be returned as is since they contain just the media
1049 # without qualities renditions.
1050 # Fortunately, master playlist can be easily distinguished from media
1051 # playlist based on particular tags availability. As of [1, 2] master
1052 # playlist tags MUST NOT appear in a media playlist and vice versa.
1053 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1054 # and MUST NOT appear in master playlist thus we can clearly detect media
1055 # playlist with this criterion.
1056 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1057 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1058 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
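# For illustration (hypothetical manifest excerpt), a master playlist contains
# lines such as:
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=1280x720,CODECS="avc1.64001f,mp4a.40.2"
#   http://example.com/hi.m3u8
# whereas a media playlist lists the actual segments and carries
# #EXT-X-TARGETDURATION.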
1059 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1062 'format_id': m3u8_id,
1064 'protocol': entry_protocol,
1065 'preference': preference,
1069 kv_rex = re.compile(
1070 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1071 for line in m3u8_doc.splitlines():
1072 if line.startswith('#EXT-X-STREAM-INF:'):
1074 for m in kv_rex.finditer(line):
1076 if v.startswith('"'):
1078 last_info[m.group('key')] = v
1079 elif line.startswith('#EXT-X-MEDIA:'):
1081 for m in kv_rex.finditer(line):
1083 if v.startswith('"'):
1085 last_media[m.group('key')] = v
1086 elif line.startswith('#') or not line.strip():
1089 if last_info is None:
1090 formats.append({'url': format_url(line)})
1092 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1095 format_id.append(m3u8_id)
1096 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1097 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1099 'format_id': '-'.join(format_id),
1100 'url': format_url(line.strip()),
1103 'protocol': entry_protocol,
1104 'preference': preference,
1106 resolution = last_info.get('RESOLUTION')
1108 width_str, height_str = resolution.split('x')
1109 f['width'] = int(width_str)
1110 f['height'] = int(height_str)
1111 codecs = last_info.get('CODECS')
1113 vcodec, acodec = [None] * 2
1114 va_codecs = codecs.split(',')
1115 if len(va_codecs) == 1:
1116 # Audio only entries usually come with single codec and
1117 # no resolution. For more robustness we also check it to be mp4 audio.
1119 if not resolution and va_codecs[0].startswith('mp4a'):
1120 vcodec, acodec = 'none', va_codecs[0]
1122 vcodec = va_codecs[0]
1124 vcodec, acodec = va_codecs[:2]
1129 if last_media is not None:
1130 f['m3u8_media'] = last_media
1134 self._sort_formats(formats)
1138 def _xpath_ns(path, namespace=None):
1142 for c in path.split('/'):
1143 if not c or c == '.':
1146 out.append('{%s}%s' % (namespace, c))
1147 return '/'.join(out)
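# For example, with the SMIL 2.1 namespace,
# _xpath_ns('./head/meta', 'http://www.w3.org/2005/SMIL21/Language') returns
# './{http://www.w3.org/2005/SMIL21/Language}head/{http://www.w3.org/2005/SMIL21/Language}meta'.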
1149 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1150 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1156 namespace = self._parse_smil_namespace(smil)
1158 return self._parse_smil_formats(
1159 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1161 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1162 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1165 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1167 def _download_smil(self, smil_url, video_id, fatal=True):
1168 return self._download_xml(
1169 smil_url, video_id, 'Downloading SMIL file',
1170 'Unable to download SMIL file', fatal=fatal)
1172 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1173 namespace = self._parse_smil_namespace(smil)
1175 formats = self._parse_smil_formats(
1176 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1177 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1179 video_id = os.path.splitext(url_basename(smil_url))[0]
1183 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1184 name = meta.attrib.get('name')
1185 content = meta.attrib.get('content')
1186 if not name or not content:
1188 if not title and name == 'title':
1190 elif not description and name in ('description', 'abstract'):
1191 description = content
1192 elif not upload_date and name == 'date':
1193 upload_date = unified_strdate(content)
1196 'id': image.get('type'),
1197 'url': image.get('src'),
1198 'width': int_or_none(image.get('width')),
1199 'height': int_or_none(image.get('height')),
1200 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1204 'title': title or video_id,
1205 'description': description,
1206 'upload_date': upload_date,
1207 'thumbnails': thumbnails,
1209 'subtitles': subtitles,
1212 def _parse_smil_namespace(self, smil):
1213 return self._search_regex(
1214 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1216 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1218 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1219 b = meta.get('base') or meta.get('httpBase')
1230 videos = smil.findall(self._xpath_ns('.//video', namespace))
1231 for video in videos:
1232 src = video.get('src')
1233 if not src or src in srcs:
1237 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1238 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1239 width = int_or_none(video.get('width'))
1240 height = int_or_none(video.get('height'))
1241 proto = video.get('proto')
1242 ext = video.get('ext')
1243 src_ext = determine_ext(src)
1244 streamer = video.get('streamer') or base
1246 if proto == 'rtmp' or streamer.startswith('rtmp'):
1252 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1254 'filesize': filesize,
1258 if transform_rtmp_url:
1259 streamer, src = transform_rtmp_url(streamer, src)
1260 formats[-1].update({
1266 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1267 src_url = src_url.strip()
1269 if proto == 'm3u8' or src_ext == 'm3u8':
1270 m3u8_formats = self._extract_m3u8_formats(
1271 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1272 if len(m3u8_formats) == 1:
1274 m3u8_formats[0].update({
1275 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1280 formats.extend(m3u8_formats)
1283 if src_ext == 'f4m':
1288 'plugin': 'flowplayer-3.2.0.1',
1290 f4m_url += '&' if '?' in f4m_url else '?'
1291 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1292 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1295 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1299 'ext': ext or src_ext or 'flv',
1300 'format_id': 'http-%d' % (bitrate or http_count),
1302 'filesize': filesize,
1308 self._sort_formats(formats)
1312 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1315 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1316 src = textstream.get('src')
1317 if not src or src in urls:
1320 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1321 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1322 subtitles.setdefault(lang, []).append({
1328 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1329 xspf = self._download_xml(
1330 playlist_url, playlist_id, 'Downloading xspf playlist',
1331 'Unable to download xspf manifest', fatal=fatal)
1334 return self._parse_xspf(xspf, playlist_id)
1336 def _parse_xspf(self, playlist, playlist_id):
1338 'xspf': 'http://xspf.org/ns/0/',
1339 's1': 'http://static.streamone.nl/player/ns/0',
1343 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1345 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1346 description = xpath_text(
1347 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1348 thumbnail = xpath_text(
1349 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1350 duration = float_or_none(
1351 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1354 'url': location.text,
1355 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1356 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1357 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1358 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1359 self._sort_formats(formats)
1364 'description': description,
1365 'thumbnail': thumbnail,
1366 'duration': duration,
1371 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1372 res = self._download_webpage_handle(
1374 note=note or 'Downloading MPD manifest',
1375 errnote=errnote or 'Failed to download MPD manifest',
1380 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1382 return self._parse_mpd_formats(
1383 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1385 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1386 if mpd_doc.get('type') == 'dynamic':
1389 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1392 return self._xpath_ns(path, namespace)
1394 def is_drm_protected(element):
1395 return element.find(_add_ns('ContentProtection')) is not None
1397 def extract_multisegment_info(element, ms_parent_info):
1398 ms_info = ms_parent_info.copy()
1399 segment_list = element.find(_add_ns('SegmentList'))
1400 if segment_list is not None:
1401 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1403 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1404 initialization = segment_list.find(_add_ns('Initialization'))
1405 if initialization is not None:
1406 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1408 segment_template = element.find(_add_ns('SegmentTemplate'))
1409 if segment_template is not None:
1410 start_number = segment_template.get('startNumber')
1412 ms_info['start_number'] = int(start_number)
1413 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1414 if segment_timeline is not None:
1415 s_e = segment_timeline.findall(_add_ns('S'))
1417 ms_info['total_number'] = 0
1419 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1421 timescale = segment_template.get('timescale')
1423 ms_info['timescale'] = int(timescale)
1424 segment_duration = segment_template.get('duration')
1425 if segment_duration:
1426 ms_info['segment_duration'] = int(segment_duration)
1427 media_template = segment_template.get('media')
1429 ms_info['media_template'] = media_template
1430 initialization = segment_template.get('initialization')
1432 ms_info['initialization_url'] = initialization
1434 initialization = segment_template.find(_add_ns('Initialization'))
1435 if initialization is not None:
1436 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1439 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1441 for period in mpd_doc.findall(_add_ns('Period')):
1442 period_duration = parse_duration(period.get('duration')) or mpd_duration
1443 period_ms_info = extract_multisegment_info(period, {
1447 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1448 if is_drm_protected(adaptation_set):
1450 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1451 for representation in adaptation_set.findall(_add_ns('Representation')):
1452 if is_drm_protected(representation):
1454 representation_attrib = adaptation_set.attrib.copy()
1455 representation_attrib.update(representation.attrib)
1456 mime_type = representation_attrib.get('mimeType')
1457 content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1458 if content_type == 'text':
1459 # TODO implement WebVTT downloading
1461 elif content_type == 'video' or content_type == 'audio':
1463 for element in (representation, adaptation_set, period, mpd_doc):
1464 base_url_e = element.find(_add_ns('BaseURL'))
1465 if base_url_e is not None:
1466 base_url = base_url_e.text + base_url
1467 if re.match(r'^https?://', base_url):
1469 if mpd_base_url and not re.match(r'^https?://', base_url):
1470 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1472 base_url = mpd_base_url + base_url
1473 representation_id = representation_attrib.get('id')
1474 lang = representation_attrib.get('lang')
1475 url_el = representation.find(_add_ns('BaseURL'))
1476 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1478 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1480 'width': int_or_none(representation_attrib.get('width')),
1481 'height': int_or_none(representation_attrib.get('height')),
1482 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1483 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1484 'fps': int_or_none(representation_attrib.get('frameRate')),
1485 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1486 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1487 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1488 'format_note': 'DASH %s' % content_type,
1489 'filesize': filesize,
1491 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1492 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1493 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1494 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1495 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1496 media_template = representation_ms_info['media_template']
1497 media_template = media_template.replace('$RepresentationID$', representation_id)
1498 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1499 media_template = media_template.replace('$$', '$')
1500 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
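# As an illustration (hypothetical template): 'seg-$RepresentationID$-$Number%05d$.m4s'
# becomes 'seg-video1-%(Number)05d.m4s' after the substitutions above (for a
# representation id of 'video1') and is then expanded per segment number,
# e.g. 'seg-video1-00001.m4s' for segment number 1.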
1501 if 'segment_urls' in representation_ms_info:
1503 'segment_urls': representation_ms_info['segment_urls'],
1504 'protocol': 'http_dash_segments',
1506 if 'initialization_url' in representation_ms_info:
1507 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1509 'initialization_url': initialization_url,
1511 if not f.get('url'):
1512 f['url'] = initialization_url
1514 existing_format = next(
1515 fo for fo in formats
1516 if fo['format_id'] == representation_id)
1517 except StopIteration:
1518 full_info = formats_dict.get(representation_id, {}).copy()
1520 formats.append(full_info)
1522 existing_format.update(f)
1524 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1525 self._sort_formats(formats)
1528 def _live_title(self, name):
1529 """ Generate the title for a live video """
1530 now = datetime.datetime.now()
1531 now_str = now.strftime('%Y-%m-%d %H:%M')
1532 return name + ' ' + now_str
1534 def _int(self, v, name, fatal=False, **kwargs):
1535 res = int_or_none(v, **kwargs)
1536 if 'get_attr' in kwargs:
1537 print(getattr(v, kwargs['get_attr']))
1539 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1541 raise ExtractorError(msg)
1543 self._downloader.report_warning(msg)
1546 def _float(self, v, name, fatal=False, **kwargs):
1547 res = float_or_none(v, **kwargs)
1549 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1551 raise ExtractorError(msg)
1553 self._downloader.report_warning(msg)
1556 def _set_cookie(self, domain, name, value, expire_time=None):
1557 cookie = compat_cookiejar.Cookie(
1558 0, name, value, None, None, domain, None,
1559 None, '/', True, False, expire_time, '', None, None, None)
1560 self._downloader.cookiejar.set_cookie(cookie)
1562 def _get_cookies(self, url):
1563 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1564 req = sanitized_Request(url)
1565 self._downloader.cookiejar.add_cookie_header(req)
1566 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1568 def get_testcases(self, include_onlymatching=False):
1569 t = getattr(self, '_TEST', None)
1571 assert not hasattr(self, '_TESTS'), \
1572 '%s has _TEST and _TESTS' % type(self).__name__
1575 tests = getattr(self, '_TESTS', [])
1577 if not include_onlymatching and t.get('only_matching', False):
1579 t['name'] = type(self).__name__[:-len('IE')]
1582 def is_suitable(self, age_limit):
1583 """ Test whether the extractor is generally suitable for the given
1584 age limit (i.e. pornographic sites are not, all others usually are) """
1586 any_restricted = False
1587 for tc in self.get_testcases(include_onlymatching=False):
1588 if 'playlist' in tc:
1589 tc = tc['playlist'][0]
1590 is_restricted = age_restricted(
1591 tc.get('info_dict', {}).get('age_limit'), age_limit)
1592 if not is_restricted:
1594 any_restricted = any_restricted or is_restricted
1595 return not any_restricted
1597 def extract_subtitles(self, *args, **kwargs):
1598 if (self._downloader.params.get('writesubtitles', False) or
1599 self._downloader.params.get('listsubtitles')):
1600 return self._get_subtitles(*args, **kwargs)
1603 def _get_subtitles(self, *args, **kwargs):
1604 raise NotImplementedError('This method must be implemented by subclasses')
1607 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1608 """ Merge subtitle items for one language. Items with duplicated URLs
1609 will be dropped. """
1610 list1_urls = set([item['url'] for item in subtitle_list1])
1611 ret = list(subtitle_list1)
1612 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1616 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1617 """ Merge two subtitle dictionaries, language by language. """
1618 ret = dict(subtitle_dict1)
1619 for lang in subtitle_dict2:
1620 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1623 def extract_automatic_captions(self, *args, **kwargs):
1624 if (self._downloader.params.get('writeautomaticsub', False) or
1625 self._downloader.params.get('listsubtitles')):
1626 return self._get_automatic_captions(*args, **kwargs)
1629 def _get_automatic_captions(self, *args, **kwargs):
1630 raise NotImplementedError('This method must be implemented by subclasses')
1632 def mark_watched(self, *args, **kwargs):
1633 if (self._downloader.params.get('mark_watched', False) and
1634 (self._get_login_info()[0] is not None or
1635 self._downloader.params.get('cookiefile') is not None)):
1636 self._mark_watched(*args, **kwargs)
1638 def _mark_watched(self, *args, **kwargs):
1639 raise NotImplementedError('This method must be implemented by subclasses')
1642 class SearchInfoExtractor(InfoExtractor):
1644 Base class for paged search query extractors.
1645 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1646 Instances should define _SEARCH_KEY and _MAX_RESULTS.
1650 def _make_valid_url(cls):
1651 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
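# For a hypothetical _SEARCH_KEY of 'examplesearch', this matches queries such as
#   examplesearch:cute cats       (first result only)
#   examplesearch5:cute cats      (first 5 results)
#   examplesearchall:cute cats    (up to _MAX_RESULTS results)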
1654 def suitable(cls, url):
1655 return re.match(cls._make_valid_url(), url) is not None
1657 def _real_extract(self, query):
1658 mobj = re.match(self._make_valid_url(), query)
1660 raise ExtractorError('Invalid search query "%s"' % query)
1662 prefix = mobj.group('prefix')
1663 query = mobj.group('query')
1665 return self._get_n_results(query, 1)
1666 elif prefix == 'all':
1667 return self._get_n_results(query, self._MAX_RESULTS)
1671 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1672 elif n > self._MAX_RESULTS:
1673 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1674 n = self._MAX_RESULTS
1675 return self._get_n_results(query, n)
1677 def _get_n_results(self, query, n):
1678 """Get a specified number of results for a query"""
1679 raise NotImplementedError('This method must be implemented by subclasses')
1682 def SEARCH_KEY(self):
1683 return self._SEARCH_KEY