_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_http_client,
  20     compat_urllib_error,
  21     compat_urllib_parse,
  22     compat_urlparse,
  23     compat_str,
  24     compat_etree_fromstring,
  25 )
  26 from ..utils import (
  27     NO_DEFAULT,
  28     age_restricted,
  29     bug_reports_message,
  30     clean_html,
  31     compiled_regex_type,
  32     determine_ext,
  33     error_to_compat_str,
  34     ExtractorError,
  35     fix_xml_ampersands,
  36     float_or_none,
  37     int_or_none,
  38     parse_iso8601,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     sanitized_Request,
  42     unescapeHTML,
  43     unified_strdate,
  44     url_basename,
  45     xpath_text,
  46     xpath_with_ns,
  47     determine_protocol,
  48     parse_duration,
  49 )
  50
  51
  52 class InfoExtractor(object):
  53     """Information Extractor class.
  54
  55     Information extractors are the classes that, given a URL, extract
  56     information about the video (or videos) the URL refers to. This
  57     information includes the real video URL, the video title, author and
  58     others. The information is stored in a dictionary which is then
  59     passed to the YoutubeDL. The YoutubeDL processes this
  60     information possibly downloading the video to the file system, among
  61     other possible outcomes.
  62
  63     The type field determines the type of the result.
  64     By far the most common value (and the default if _type is missing) is
  65     "video", which indicates a single video.
  66
  67     For a video, the dictionaries must include the following fields:
  68
  69     id:             Video identifier.
  70     title:          Video title, unescaped.
  71
  72     Additionally, it must contain either a formats entry or a url one:
  73
  74     formats:        A list of dictionaries for each format available, ordered
  75                     from worst to best quality.
  76
  77                     Potential fields:
  78                     * url        Mandatory. The URL of the video file
  79                     * ext        Will be calculated from URL if missing
  80                     * format     A human-readable description of the format
  81                                  ("mp4 container with h264/opus").
  82                                  Calculated from the format_id, width, height.
  83                                  and format_note fields if missing.
  84                     * format_id  A short description of the format
  85                                  ("mp4_h264_opus" or "19").
  86                                 Technically optional, but strongly recommended.
  87                     * format_note Additional info about the format
  88                                  ("3D" or "DASH video")
  89                     * width      Width of the video, if known
  90                     * height     Height of the video, if known
  91                     * resolution Textual description of width and height
  92                     * tbr        Average bitrate of audio and video in KBit/s
  93                     * abr        Average audio bitrate in KBit/s
  94                     * acodec     Name of the audio codec in use
  95                     * asr        Audio sampling rate in Hertz
  96                     * vbr        Average video bitrate in KBit/s
  97                     * fps        Frame rate
  98                     * vcodec     Name of the video codec in use
  99                     * container  Name of the container format
 100                     * filesize   The number of bytes, if known in advance
 101                     * filesize_approx  An estimate for the number of bytes
 102                     * player_url SWF Player URL (used for rtmpdump).
 103                     * protocol   The protocol that will be used for the actual
 104                                  download, lower-case.
 105                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 106                                  "m3u8", or "m3u8_native".
 107                     * preference Order number of this format. If this field is
 108                                  present and not None, the formats get sorted
 109                                  by this field, regardless of all other values.
 110                                  -1 for default (order by other properties),
 111                                  -2 or smaller for less than default.
 112                                  < -1000 to hide the format (if there is
 113                                     another one which is strictly better)
 114                     * language   Language code, e.g. "de" or "en-US".
 115                     * language_preference  Is this in the language mentioned in
 116                                  the URL?
 117                                  10 if it's what the URL is about,
 118                                  -1 for default (don't know),
 119                                  -10 otherwise, other values reserved for now.
 120                     * quality    Order number of the video quality of this
 121                                  format, irrespective of the file format.
 122                                  -1 for default (order by other properties),
 123                                  -2 or smaller for less than default.
 124                     * source_preference  Order number for this video source
 125                                   (quality takes higher priority)
 126                                  -1 for default (order by other properties),
 127                                  -2 or smaller for less than default.
 128                     * http_headers  A dictionary of additional HTTP headers
 129                                  to add to the request.
 130                     * stretched_ratio  If given and not 1, indicates that the
 131                                  video's pixels are not square.
 132                                  width : height ratio as float.
 133                     * no_resume  The server does not support resuming the
 134                                  (HTTP or RTMP) download. Boolean.
 135
 136     url:            Final video URL.
 137     ext:            Video filename extension.
 138     format:         The video format, defaults to ext (used for --get-format)
 139     player_url:     SWF Player URL (used for rtmpdump).
 140
 141     The following fields are optional:
 142
 143     alt_title:      A secondary title of the video.
 144     display_id      An alternative identifier for the video, not necessarily
 145                     unique, but available before title. Typically, id is
 146                     something like "4234987", title "Dancing naked mole rats",
 147                     and display_id "dancing-naked-mole-rats"
 148     thumbnails:     A list of dictionaries, with the following entries:
 149                         * "id" (optional, string) - Thumbnail format ID
 150                         * "url"
 151                         * "preference" (optional, int) - quality of the image
 152                         * "width" (optional, int)
 153                         * "height" (optional, int)
 154                         * "resolution" (optional, string "{width}x{height}",
 155                                         deprecated)
 156     thumbnail:      Full URL to a video thumbnail image.
 157     description:    Full video description.
 158     uploader:       Full name of the video uploader.
 159     creator:        The main artist who created the video.
 160     release_date:   The date (YYYYMMDD) when the video was released.
 161     timestamp:      UNIX timestamp of the moment the video became available.
 162     upload_date:    Video upload date (YYYYMMDD).
 163                     If not explicitly set, calculated from timestamp.
 164     uploader_id:    Nickname or id of the video uploader.
 165     location:       Physical location where the video was filmed.
 166     subtitles:      The available subtitles as a dictionary in the format
 167                     {language: subformats}. "subformats" is a list sorted from
 168                     lower to higher preference, each element is a dictionary
 169                     with the "ext" entry and one of:
 170                         * "data": The subtitles file contents
 171                         * "url": A URL pointing to the subtitles file
 172                     "ext" will be calculated from URL if missing
 173     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 174                     automatically generated captions
 175     duration:       Length of the video in seconds, as an integer or float.
 176     view_count:     How many users have watched the video on the platform.
 177     like_count:     Number of positive ratings of the video
 178     dislike_count:  Number of negative ratings of the video
 179     repost_count:   Number of reposts of the video
 180     average_rating: Average rating given by users, the scale used depends on the webpage
 181     comment_count:  Number of comments on the video
 182     comments:       A list of comments, each with one or more of the following
 183                     properties (all but one of text or html optional):
 184                         * "author" - human-readable name of the comment author
 185                         * "author_id" - user ID of the comment author
 186                         * "id" - Comment ID
 187                         * "html" - Comment as HTML
 188                         * "text" - Plain text of the comment
 189                         * "timestamp" - UNIX timestamp of comment
 190                         * "parent" - ID of the comment this one is replying to.
 191                                      Set to "root" to indicate that this is a
 192                                      comment to the original video.
 193     age_limit:      Age restriction for the video, as an integer (years)
 194     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 195                     should allow to get the same result again. (It will be set
 196                     by YoutubeDL if it's missing)
 197     categories:     A list of categories that the video falls in, for example
 198                     ["Sports", "Berlin"]
 199     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 200     is_live:        True, False, or None (=unknown). Whether this video is a
 201                     live stream that goes on instead of a fixed-length video.
 202     start_time:     Time in seconds where the reproduction should start, as
 203                     specified in the URL.
 204     end_time:       Time in seconds where the reproduction should end, as
 205                     specified in the URL.
 206
 207     The following fields should only be used when the video belongs to some logical
 208     chapter or section:
 209
 210     chapter:        Name or title of the chapter the video belongs to.
 211     chapter_number: Number of the chapter the video belongs to, as an integer.
 212     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 213
 214     The following fields should only be used when the video is an episode of some
 215     series or programme:
 216
 217     series:         Title of the series or programme the video episode belongs to.
 218     season:         Title of the season the video episode belongs to.
 219     season_number:  Number of the season the video episode belongs to, as an integer.
 220     season_id:      Id of the season the video episode belongs to, as a unicode string.
 221     episode:        Title of the video episode. Unlike mandatory video title field,
 222                     this field should denote the exact title of the video episode
 223                     without any kind of decoration.
 224     episode_number: Number of the video episode within a season, as an integer.
 225     episode_id:     Id of the video episode, as a unicode string.
 226
 227     Unless mentioned otherwise, the fields should be Unicode strings.
 228
 229     Unless mentioned otherwise, None is equivalent to absence of information.
 230
 231
 232     _type "playlist" indicates multiple videos.
 233     There must be a key "entries", which is a list, an iterable, or a PagedList
 234     object, each element of which is a valid dictionary by this specification.
 235
 236     Additionally, playlists can have "title", "description" and "id" attributes
 237     with the same semantics as videos (see above).
 238
 239
 240     _type "multi_video" indicates that there are multiple videos that
 241     form a single show, for example multiple acts of an opera or TV episode.
 242     It must have an entries key like a playlist and contain all the keys
 243     required for a video at the same time.
 244
 245
 246     _type "url" indicates that the video must be extracted from another
 247     location, possibly by a different extractor. Its only required key is:
 248     "url" - the next URL to extract.
 249     The key "ie_key" can be set to the class name (minus the trailing "IE",
 250     e.g. "Youtube") if the extractor class is known in advance.
 251     Additionally, the dictionary may have any properties of the resolved entity
 252     known in advance, for example "title" if the title of the referred video is
 253     known ahead of time.
 254
 255
 256     _type "url_transparent" entities have the same specification as "url", but
 257     indicate that the given additional information is more precise than the one
 258     associated with the resolved URL.
 259     This is useful when a site employs a video service that hosts the video and
 260     its technical metadata, but that video service does not embed a useful
 261     title, description etc.
 262
 263
 264     Subclasses of this one should re-define the _real_initialize() and
 265     _real_extract() methods and define a _VALID_URL regexp.
 266     Probably, they should also be added to the list of extractors.
 267
 268     Finally, the _WORKING attribute should be set to False for broken IEs
 269     in order to warn the users and skip the tests.
 270     """
 271
 272     _ready = False
 273     _downloader = None
 274     _WORKING = True
 275
 276     def __init__(self, downloader=None):
 277         """Constructor. Receives an optional downloader."""
 278         self._ready = False
 279         self.set_downloader(downloader)
 280
 281     @classmethod
 282     def suitable(cls, url):
 283         """Receives a URL and returns True if suitable for this IE."""
 284
 285         # This does not use has/getattr intentionally - we want to know whether
 286         # we have cached the regexp for *this* class, whereas getattr would also
 287         # match the superclass
 288         if '_VALID_URL_RE' not in cls.__dict__:
 289             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 290         return cls._VALID_URL_RE.match(url) is not None
 291
 292     @classmethod
 293     def _match_id(cls, url):
 294         if '_VALID_URL_RE' not in cls.__dict__:
 295             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 296         m = cls._VALID_URL_RE.match(url)
 297         assert m
 298         return m.group('id')
 299
 300     @classmethod
 301     def working(cls):
 302         """Getter method for _WORKING."""
 303         return cls._WORKING
 304
 305     def initialize(self):
 306         """Initializes an instance (authentication, etc)."""
 307         if not self._ready:
 308             self._real_initialize()
 309             self._ready = True
 310
 311     def extract(self, url):
 312         """Extracts URL information and returns it in list of dicts."""
 313         try:
 314             self.initialize()
 315             return self._real_extract(url)
 316         except ExtractorError:
 317             raise
 318         except compat_http_client.IncompleteRead as e:
 319             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 320         except (KeyError, StopIteration) as e:
 321             raise ExtractorError('An extractor error has occurred.', cause=e)
 322
 323     def set_downloader(self, downloader):
 324         """Sets the downloader for this IE."""
 325         self._downloader = downloader
 326
 327     def _real_initialize(self):
 328         """Real initialization process. Redefine in subclasses."""
 329         pass
 330
 331     def _real_extract(self, url):
 332         """Real extraction process. Redefine in subclasses."""
 333         pass
 334
 335     @classmethod
 336     def ie_key(cls):
 337         """A string for getting the InfoExtractor with get_info_extractor"""
 338         return compat_str(cls.__name__[:-2])
 339
 340     @property
 341     def IE_NAME(self):
 342         return compat_str(type(self).__name__[:-2])
 343
 344     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 345         """ Returns the response handle """
 346         if note is None:
 347             self.report_download_webpage(video_id)
 348         elif note is not False:
 349             if video_id is None:
 350                 self.to_screen('%s' % (note,))
 351             else:
 352                 self.to_screen('%s: %s' % (video_id, note))
 353         try:
 354             return self._downloader.urlopen(url_or_request)
 355         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 356             if errnote is False:
 357                 return False
 358             if errnote is None:
 359                 errnote = 'Unable to download webpage'
 360
 361             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 362             if fatal:
 363                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 364             else:
 365                 self._downloader.report_warning(errmsg)
 366                 return False
 367
 368     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 369         """ Returns a tuple (page content as string, URL handle) """
 370         # Strip hashes from the URL (#1038)
 371         if isinstance(url_or_request, (compat_str, str)):
 372             url_or_request = url_or_request.partition('#')[0]
 373
 374         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 375         if urlh is False:
 376             assert not fatal
 377             return False
 378         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 379         return (content, urlh)
 380
 381     @staticmethod
 382     def _guess_encoding_from_content(content_type, webpage_bytes):
 383         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 384         if m:
 385             encoding = m.group(1)
 386         else:
 387             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 388                           webpage_bytes[:1024])
 389             if m:
 390                 encoding = m.group(1).decode('ascii')
 391             elif webpage_bytes.startswith(b'\xff\xfe'):
 392                 encoding = 'utf-16'
 393             else:
 394                 encoding = 'utf-8'
 395
 396         return encoding
 397
 398     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 399         content_type = urlh.headers.get('Content-Type', '')
 400         webpage_bytes = urlh.read()
 401         if prefix is not None:
 402             webpage_bytes = prefix + webpage_bytes
 403         if not encoding:
 404             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 405         if self._downloader.params.get('dump_intermediate_pages', False):
 406             try:
 407                 url = url_or_request.get_full_url()
 408             except AttributeError:
 409                 url = url_or_request
 410             self.to_screen('Dumping request to ' + url)
 411             dump = base64.b64encode(webpage_bytes).decode('ascii')
 412             self._downloader.to_screen(dump)
 413         if self._downloader.params.get('write_pages', False):
 414             try:
 415                 url = url_or_request.get_full_url()
 416             except AttributeError:
 417                 url = url_or_request
 418             basen = '%s_%s' % (video_id, url)
 419             if len(basen) > 240:
 420                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 421                 basen = basen[:240 - len(h)] + h
 422             raw_filename = basen + '.dump'
 423             filename = sanitize_filename(raw_filename, restricted=True)
 424             self.to_screen('Saving request to ' + filename)
 425             # Working around MAX_PATH limitation on Windows (see
 426             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 427             if os.name == 'nt':
 428                 absfilepath = os.path.abspath(filename)
 429                 if len(absfilepath) > 259:
 430                     filename = '\\\\?\\' + absfilepath
 431             with open(filename, 'wb') as outf:
 432                 outf.write(webpage_bytes)
 433
 434         try:
 435             content = webpage_bytes.decode(encoding, 'replace')
 436         except LookupError:
 437             content = webpage_bytes.decode('utf-8', 'replace')
 438
 439         if ('<title>Access to this site is blocked</title>' in content and
 440                 'Websense' in content[:512]):
 441             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 442             blocked_iframe = self._html_search_regex(
 443                 r'<iframe src="([^"]+)"', content,
 444                 'Websense information URL', default=None)
 445             if blocked_iframe:
 446                 msg += ' Visit %s for more details' % blocked_iframe
 447             raise ExtractorError(msg, expected=True)
 448         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 449             msg = (
 450                 'Access to this webpage has been blocked by Indian censorship. '
 451                 'Use a VPN or proxy server (with --proxy) to route around it.')
 452             block_msg = self._html_search_regex(
 453                 r'</h1><p>(.*?)</p>',
 454                 content, 'block message', default=None)
 455             if block_msg:
 456                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 457             raise ExtractorError(msg, expected=True)
 458
 459         return content
 460
 461     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 462         """ Returns the data of the page as a string """
 463         success = False
 464         try_count = 0
 465         while success is False:
 466             try:
 467                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 468                 success = True
 469             except compat_http_client.IncompleteRead as e:
 470                 try_count += 1
 471                 if try_count >= tries:
 472                     raise e
 473                 self._sleep(timeout, video_id)
 474         if res is False:
 475             return res
 476         else:
 477             content, _ = res
 478             return content
 479
 480     def _download_xml(self, url_or_request, video_id,
 481                       note='Downloading XML', errnote='Unable to download XML',
 482                       transform_source=None, fatal=True, encoding=None):
 483         """Return the xml as an xml.etree.ElementTree.Element"""
 484         xml_string = self._download_webpage(
 485             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 486         if xml_string is False:
 487             return xml_string
 488         if transform_source:
 489             xml_string = transform_source(xml_string)
 490         return compat_etree_fromstring(xml_string.encode('utf-8'))
 491
 492     def _download_json(self, url_or_request, video_id,
 493                        note='Downloading JSON metadata',
 494                        errnote='Unable to download JSON metadata',
 495                        transform_source=None,
 496                        fatal=True, encoding=None):
 497         json_string = self._download_webpage(
 498             url_or_request, video_id, note, errnote, fatal=fatal,
 499             encoding=encoding)
 500         if (not fatal) and json_string is False:
 501             return None
 502         return self._parse_json(
 503             json_string, video_id, transform_source=transform_source, fatal=fatal)
 504
 505     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 506         if transform_source:
 507             json_string = transform_source(json_string)
 508         try:
 509             return json.loads(json_string)
 510         except ValueError as ve:
 511             errmsg = '%s: Failed to parse JSON ' % video_id
 512             if fatal:
 513                 raise ExtractorError(errmsg, cause=ve)
 514             else:
 515                 self.report_warning(errmsg + str(ve))
 516
 517     def report_warning(self, msg, video_id=None):
 518         idstr = '' if video_id is None else '%s: ' % video_id
 519         self._downloader.report_warning(
 520             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 521
 522     def to_screen(self, msg):
 523         """Print msg to screen, prefixing it with '[ie_name]'"""
 524         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 525
 526     def report_extraction(self, id_or_name):
 527         """Report information extraction."""
 528         self.to_screen('%s: Extracting information' % id_or_name)
 529
 530     def report_download_webpage(self, video_id):
 531         """Report webpage download."""
 532         self.to_screen('%s: Downloading webpage' % video_id)
 533
 534     def report_age_confirmation(self):
 535         """Report attempt to confirm age."""
 536         self.to_screen('Confirming age')
 537
 538     def report_login(self):
 539         """Report attempt to log in."""
 540         self.to_screen('Logging in')
 541
 542     @staticmethod
 543     def raise_login_required(msg='This video is only available for registered users'):
 544         raise ExtractorError(
 545             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 546             expected=True)
 547
 548     @staticmethod
 549     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 550         raise ExtractorError(
 551             '%s. You might want to use --proxy to workaround.' % msg,
 552             expected=True)
 553
 554     # Methods for following #608
 555     @staticmethod
 556     def url_result(url, ie=None, video_id=None, video_title=None):
 557         """Returns a URL that points to a page that should be processed"""
 558         # TODO: ie should be the class used for getting the info
 559         video_info = {'_type': 'url',
 560                       'url': url,
 561                       'ie_key': ie}
 562         if video_id is not None:
 563             video_info['id'] = video_id
 564         if video_title is not None:
 565             video_info['title'] = video_title
 566         return video_info
 567
 568     @staticmethod
 569     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 570         """Returns a playlist"""
 571         video_info = {'_type': 'playlist',
 572                       'entries': entries}
 573         if playlist_id:
 574             video_info['id'] = playlist_id
 575         if playlist_title:
 576             video_info['title'] = playlist_title
 577         if playlist_description:
 578             video_info['description'] = playlist_description
 579         return video_info
 580
 581     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 582         """
 583         Perform a regex search on the given string, using a single or a list of
 584         patterns returning the first matching group.
 585         In case of failure return a default value or raise a WARNING or a
 586         RegexNotFoundError, depending on fatal, specifying the field name.
 587         """
 588         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 589             mobj = re.search(pattern, string, flags)
 590         else:
 591             for p in pattern:
 592                 mobj = re.search(p, string, flags)
 593                 if mobj:
 594                     break
 595
 596         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 597             _name = '\033[0;34m%s\033[0m' % name
 598         else:
 599             _name = name
 600
 601         if mobj:
 602             if group is None:
 603                 # return the first matching group
 604                 return next(g for g in mobj.groups() if g is not None)
 605             else:
 606                 return mobj.group(group)
 607         elif default is not NO_DEFAULT:
 608             return default
 609         elif fatal:
 610             raise RegexNotFoundError('Unable to extract %s' % _name)
 611         else:
 612             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 613             return None
 614
 615     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 616         """
 617         Like _search_regex, but strips HTML tags and unescapes entities.
 618         """
 619         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 620         if res:
 621             return clean_html(res).strip()
 622         else:
 623             return res
 624
 625     def _get_login_info(self):
 626         """
 627         Get the login info as (username, password)
 628         It will look in the netrc file using the _NETRC_MACHINE value
 629         If there's no info available, return (None, None)
 630         """
 631         if self._downloader is None:
 632             return (None, None)
 633
 634         username = None
 635         password = None
 636         downloader_params = self._downloader.params
 637
 638         # Attempt to use provided username and password or .netrc data
 639         if downloader_params.get('username', None) is not None:
 640             username = downloader_params['username']
 641             password = downloader_params['password']
 642         elif downloader_params.get('usenetrc', False):
 643             try:
 644                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 645                 if info is not None:
 646                     username = info[0]
 647                     password = info[2]
 648                 else:
 649                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 650             except (IOError, netrc.NetrcParseError) as err:
 651                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 652
 653         return (username, password)
 654
 655     def _get_tfa_info(self, note='two-factor verification code'):
 656         """
 657         Get the two-factor authentication info
 658         TODO - asking the user will be required for sms/phone verify
 659         currently just uses the command line option
 660         If there's no info available, return None
 661         """
 662         if self._downloader is None:
 663             return None
 664         downloader_params = self._downloader.params
 665
 666         if downloader_params.get('twofactor', None) is not None:
 667             return downloader_params['twofactor']
 668
 669         return compat_getpass('Type %s and press [Return]: ' % note)
 670
 671     # Helper functions for extracting OpenGraph info
 672     @staticmethod
 673     def _og_regexes(prop):
 674         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 675         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 676                        % {'prop': re.escape(prop)})
 677         template = r'<meta[^>]+?%s[^>]+?%s'
 678         return [
 679             template % (property_re, content_re),
 680             template % (content_re, property_re),
 681         ]
 682
 683     @staticmethod
 684     def _meta_regex(prop):
 685         return r'''(?isx)<meta
 686                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 687                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 688
 689     def _og_search_property(self, prop, html, name=None, **kargs):
 690         if name is None:
 691             name = 'OpenGraph %s' % prop
 692         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 693         if escaped is None:
 694             return None
 695         return unescapeHTML(escaped)
 696
 697     def _og_search_thumbnail(self, html, **kargs):
 698         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 699
 700     def _og_search_description(self, html, **kargs):
 701         return self._og_search_property('description', html, fatal=False, **kargs)
 702
 703     def _og_search_title(self, html, **kargs):
 704         return self._og_search_property('title', html, **kargs)
 705
 706     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 707         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 708         if secure:
 709             regexes = self._og_regexes('video:secure_url') + regexes
 710         return self._html_search_regex(regexes, html, name, **kargs)
 711
 712     def _og_search_url(self, html, **kargs):
 713         return self._og_search_property('url', html, **kargs)
 714
 715     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 716         if display_name is None:
 717             display_name = name
 718         return self._html_search_regex(
 719             self._meta_regex(name),
 720             html, display_name, fatal=fatal, group='content', **kwargs)
 721
 722     def _dc_search_uploader(self, html):
 723         return self._html_search_meta('dc.creator', html, 'uploader')
 724
 725     def _rta_search(self, html):
 726         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 727         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 728                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 729                      html):
 730             return 18
 731         return 0
 732
 733     def _media_rating_search(self, html):
 734         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 735         rating = self._html_search_meta('rating', html)
 736
 737         if not rating:
 738             return None
 739
 740         RATING_TABLE = {
 741             'safe for kids': 0,
 742             'general': 8,
 743             '14 years': 14,
 744             'mature': 17,
 745             'restricted': 19,
 746         }
 747         return RATING_TABLE.get(rating.lower(), None)
 748
 749     def _family_friendly_search(self, html):
 750         # See http://schema.org/VideoObject
 751         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 752
 753         if not family_friendly:
 754             return None
 755
 756         RATING_TABLE = {
 757             '1': 0,
 758             'true': 0,
 759             '0': 18,
 760             'false': 18,
 761         }
 762         return RATING_TABLE.get(family_friendly.lower(), None)
 763
 764     def _twitter_search_player(self, html):
 765         return self._html_search_meta('twitter:player', html,
 766                                       'twitter card player')
 767
 768     def _search_json_ld(self, html, video_id, **kwargs):
 769         json_ld = self._search_regex(
 770             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 771             html, 'JSON-LD', group='json_ld', **kwargs)
 772         if not json_ld:
 773             return {}
 774         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 775
 776     def _json_ld(self, json_ld, video_id, fatal=True):
 777         if isinstance(json_ld, compat_str):
 778             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 779         if not json_ld:
 780             return {}
 781         info = {}
 782         if json_ld.get('@context') == 'http://schema.org':
 783             item_type = json_ld.get('@type')
 784             if item_type == 'TVEpisode':
 785                 info.update({
 786                     'episode': unescapeHTML(json_ld.get('name')),
 787                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 788                     'description': unescapeHTML(json_ld.get('description')),
 789                 })
 790                 part_of_season = json_ld.get('partOfSeason')
 791                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 792                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 793                 part_of_series = json_ld.get('partOfSeries')
 794                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 795                     info['series'] = unescapeHTML(part_of_series.get('name'))
 796             elif item_type == 'Article':
 797                 info.update({
 798                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 799                     'title': unescapeHTML(json_ld.get('headline')),
 800                     'description': unescapeHTML(json_ld.get('articleBody')),
 801                 })
 802         return dict((k, v) for k, v in info.items() if v is not None)
 803
 804     @staticmethod
 805     def _hidden_inputs(html):
 806         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 807         hidden_inputs = {}
 808         for input in re.findall(r'(?i)<input([^>]+)>', html):
 809             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 810                 continue
 811             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 812             if not name:
 813                 continue
 814             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 815             if not value:
 816                 continue
 817             hidden_inputs[name.group('value')] = value.group('value')
 818         return hidden_inputs
 819
 820     def _form_hidden_inputs(self, form_id, html):
 821         form = self._search_regex(
 822             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 823             html, '%s form' % form_id, group='form')
 824         return self._hidden_inputs(form)
 825
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort `formats` in place, in ascending order of a computed sort key.

        Before sorting, 'tbr' is filled in from 'abr' + 'vbr' when it is
        missing and both are known; while computing the key, a missing
        'ext' is derived from 'url'. Both changes are stored in the format
        dicts themselves.

        When `field_preference` is a list or tuple of field names, the key
        is just the values of those fields (None counts as -1). Otherwise
        the key is the fixed tuple built at the end of _formats_key.

        Raises ExtractorError when `formats` is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # An explicit field list replaces the default ordering entirely
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            # Without an explicit preference, f4f/f4m formats are ranked
            # slightly below everything else
            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Plain HTTP(S) is ranked slightly above any other protocol
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            # The position of the extension in ORDER is its rank (a higher
            # index sorts later); unknown extensions get -1. Only one of
            # ext_preference / audio_ext_preference is meaningful per format,
            # the other one is fixed at 0.
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare element by element, so earlier entries dominate;
            # missing numeric fields count as -1 and a missing format_id as ''
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 893
 894     def _check_formats(self, formats, video_id):
 895         if formats:
 896             formats[:] = filter(
 897                 lambda f: self._is_valid_url(
 898                     f['url'], video_id,
 899                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 900                 formats)
 901
 902     def _is_valid_url(self, url, video_id, item='video'):
 903         url = self._proto_relative_url(url, scheme='http:')
 904         # For now assume non HTTP(S) URLs always valid
 905         if not (url.startswith('http://') or url.startswith('https://')):
 906             return True
 907         try:
 908             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 909             return True
 910         except ExtractorError as e:
 911             if isinstance(e.cause, compat_urllib_error.URLError):
 912                 self.to_screen(
 913                     '%s: %s URL is invalid, skipping' % (video_id, item))
 914                 return False
 915             raise
 916
 917     def http_scheme(self):
 918         """ Either "http:" or "https:", depending on the user's preferences """
 919         return (
 920             'http:'
 921             if self._downloader.params.get('prefer_insecure', False)
 922             else 'https:')
 923
 924     def _proto_relative_url(self, url, scheme=None):
 925         if url is None:
 926             return url
 927         if url.startswith('//'):
 928             if scheme is None:
 929                 scheme = self.http_scheme()
 930             return scheme + url
 931         else:
 932             return url
 933
 934     def _sleep(self, timeout, video_id, msg_template=None):
 935         if msg_template is None:
 936             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 937         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 938         self.to_screen(msg)
 939         time.sleep(timeout)
 940
 941     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 942                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 943                              fatal=True):
 944         manifest = self._download_xml(
 945             manifest_url, video_id, 'Downloading f4m manifest',
 946             'Unable to download f4m manifest',
 947             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 948             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 949             transform_source=transform_source,
 950             fatal=fatal)
 951
 952         if manifest is False:
 953             return []
 954
 955         formats = []
 956         manifest_version = '1.0'
 957         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 958         if not media_nodes:
 959             manifest_version = '2.0'
 960             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 961         base_url = xpath_text(
 962             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 963             'base URL', default=None)
 964         if base_url:
 965             base_url = base_url.strip()
 966         for i, media_el in enumerate(media_nodes):
 967             if manifest_version == '2.0':
 968                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 969                 if not media_url:
 970                     continue
 971                 manifest_url = (
 972                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 973                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 974                 # If media_url is itself a f4m manifest do the recursive extraction
 975                 # since bitrates in parent manifest (this one) and media_url manifest
 976                 # may differ leading to inability to resolve the format by requested
 977                 # bitrate in f4m downloader
 978                 if determine_ext(manifest_url) == 'f4m':
 979                     formats.extend(self._extract_f4m_formats(
 980                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
 981                     continue
 982             tbr = int_or_none(media_el.attrib.get('bitrate'))
 983             formats.append({
 984                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 985                 'url': manifest_url,
 986                 'ext': 'flv',
 987                 'tbr': tbr,
 988                 'width': int_or_none(media_el.attrib.get('width')),
 989                 'height': int_or_none(media_el.attrib.get('height')),
 990                 'preference': preference,
 991             })
 992         self._sort_formats(formats)
 993
 994         return formats
 995
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """
        Download an m3u8 playlist and turn it into a list of format dicts.

        A playlist containing #EXT-X-TARGETDURATION is returned as a single
        format pointing at the playlist itself. Otherwise one format is
        built per URI line, using the attributes of the preceding
        #EXT-X-STREAM-INF (and #EXT-X-MEDIA) tag, in addition to a 'meta'
        format for the playlist URL; the result is sorted with
        _sort_formats.

        Returns [] when the download returned False (non-fatal failure).
        """

        # Entry for the playlist itself, ranked one below `preference`
        # (or at -1 when no preference is given)
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # m3u8_url is looked up when the lambda is called, so relative URIs
        # are joined against the value assigned after the download below
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        # Continue with the URL reported by the response handle
        m3u8_url = urlh.geturl()
        # A Media Playlist Tag MUST NOT appear in a Master Playlist
        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA tag
        last_info = None
        last_media = None
        # KEY=value or KEY="quoted value" pairs of a tag's attribute list
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Any other tag, comment or blank line
                continue
            else:
                # A URI line; one seen before any #EXT-X-STREAM-INF tag
                # carries no metadata
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Prefer the NAME of the preceding non-subtitles media tag,
                # then the bitrate, then the number of formats so far
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                # A media tag is attached to the first following format only
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
1096
1097     @staticmethod
1098     def _xpath_ns(path, namespace=None):
1099         if not namespace:
1100             return path
1101         out = []
1102         for c in path.split('/'):
1103             if not c or c == '.':
1104                 out.append(c)
1105             else:
1106                 out.append('{%s}%s' % (namespace, c))
1107         return '/'.join(out)
1108
1109     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1110         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1111
1112         if smil is False:
1113             assert not fatal
1114             return []
1115
1116         namespace = self._parse_smil_namespace(smil)
1117
1118         return self._parse_smil_formats(
1119             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1120
1121     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1122         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1123         if smil is False:
1124             return {}
1125         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1126
1127     def _download_smil(self, smil_url, video_id, fatal=True):
1128         return self._download_xml(
1129             smil_url, video_id, 'Downloading SMIL file',
1130             'Unable to download SMIL file', fatal=fatal)
1131
1132     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1133         namespace = self._parse_smil_namespace(smil)
1134
1135         formats = self._parse_smil_formats(
1136             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1137         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1138
1139         video_id = os.path.splitext(url_basename(smil_url))[0]
1140         title = None
1141         description = None
1142         upload_date = None
1143         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1144             name = meta.attrib.get('name')
1145             content = meta.attrib.get('content')
1146             if not name or not content:
1147                 continue
1148             if not title and name == 'title':
1149                 title = content
1150             elif not description and name in ('description', 'abstract'):
1151                 description = content
1152             elif not upload_date and name == 'date':
1153                 upload_date = unified_strdate(content)
1154
1155         thumbnails = [{
1156             'id': image.get('type'),
1157             'url': image.get('src'),
1158             'width': int_or_none(image.get('width')),
1159             'height': int_or_none(image.get('height')),
1160         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1161
1162         return {
1163             'id': video_id,
1164             'title': title or video_id,
1165             'description': description,
1166             'upload_date': upload_date,
1167             'thumbnails': thumbnails,
1168             'formats': formats,
1169             'subtitles': subtitles,
1170         }
1171
1172     def _parse_smil_namespace(self, smil):
1173         return self._search_regex(
1174             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1175
1176     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1177         base = smil_url
1178         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1179             b = meta.get('base') or meta.get('httpBase')
1180             if b:
1181                 base = b
1182                 break
1183
1184         formats = []
1185         rtmp_count = 0
1186         http_count = 0
1187         m3u8_count = 0
1188
1189         videos = smil.findall(self._xpath_ns('.//video', namespace))
1190         for video in videos:
1191             src = video.get('src')
1192             if not src:
1193                 continue
1194
1195             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1196             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1197             width = int_or_none(video.get('width'))
1198             height = int_or_none(video.get('height'))
1199             proto = video.get('proto')
1200             ext = video.get('ext')
1201             src_ext = determine_ext(src)
1202             streamer = video.get('streamer') or base
1203
1204             if proto == 'rtmp' or streamer.startswith('rtmp'):
1205                 rtmp_count += 1
1206                 formats.append({
1207                     'url': streamer,
1208                     'play_path': src,
1209                     'ext': 'flv',
1210                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1211                     'tbr': bitrate,
1212                     'filesize': filesize,
1213                     'width': width,
1214                     'height': height,
1215                 })
1216                 if transform_rtmp_url:
1217                     streamer, src = transform_rtmp_url(streamer, src)
1218                     formats[-1].update({
1219                         'url': streamer,
1220                         'play_path': src,
1221                     })
1222                 continue
1223
1224             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1225
1226             if proto == 'm3u8' or src_ext == 'm3u8':
1227                 m3u8_formats = self._extract_m3u8_formats(
1228                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1229                 if len(m3u8_formats) == 1:
1230                     m3u8_count += 1
1231                     m3u8_formats[0].update({
1232                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1233                         'tbr': bitrate,
1234                         'width': width,
1235                         'height': height,
1236                     })
1237                 formats.extend(m3u8_formats)
1238                 continue
1239
1240             if src_ext == 'f4m':
1241                 f4m_url = src_url
1242                 if not f4m_params:
1243                     f4m_params = {
1244                         'hdcore': '3.2.0',
1245                         'plugin': 'flowplayer-3.2.0.1',
1246                     }
1247                 f4m_url += '&' if '?' in f4m_url else '?'
1248                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1249                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1250                 continue
1251
1252             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1253                 http_count += 1
1254                 formats.append({
1255                     'url': src_url,
1256                     'ext': ext or src_ext or 'flv',
1257                     'format_id': 'http-%d' % (bitrate or http_count),
1258                     'tbr': bitrate,
1259                     'filesize': filesize,
1260                     'width': width,
1261                     'height': height,
1262                 })
1263                 continue
1264
1265         self._sort_formats(formats)
1266
1267         return formats
1268
1269     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1270         subtitles = {}
1271         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1272             src = textstream.get('src')
1273             if not src:
1274                 continue
1275             ext = textstream.get('ext') or determine_ext(src)
1276             if not ext:
1277                 type_ = textstream.get('type')
1278                 SUBTITLES_TYPES = {
1279                     'text/vtt': 'vtt',
1280                     'text/srt': 'srt',
1281                     'application/smptett+xml': 'tt',
1282                 }
1283                 if type_ in SUBTITLES_TYPES:
1284                     ext = SUBTITLES_TYPES[type_]
1285             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1286             subtitles.setdefault(lang, []).append({
1287                 'url': src,
1288                 'ext': ext,
1289             })
1290         return subtitles
1291
1292     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1293         xspf = self._download_xml(
1294             playlist_url, playlist_id, 'Downloading xpsf playlist',
1295             'Unable to download xspf manifest', fatal=fatal)
1296         if xspf is False:
1297             return []
1298         return self._parse_xspf(xspf, playlist_id)
1299
1300     def _parse_xspf(self, playlist, playlist_id):
1301         NS_MAP = {
1302             'xspf': 'http://xspf.org/ns/0/',
1303             's1': 'http://static.streamone.nl/player/ns/0',
1304         }
1305
1306         entries = []
1307         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1308             title = xpath_text(
1309                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1310             description = xpath_text(
1311                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1312             thumbnail = xpath_text(
1313                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1314             duration = float_or_none(
1315                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1316
1317             formats = [{
1318                 'url': location.text,
1319                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1320                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1321                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1322             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1323             self._sort_formats(formats)
1324
1325             entries.append({
1326                 'id': playlist_id,
1327                 'title': title,
1328                 'description': description,
1329                 'thumbnail': thumbnail,
1330                 'duration': duration,
1331                 'formats': formats,
1332             })
1333         return entries
1334
1335     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1336         res = self._download_webpage_handle(
1337             mpd_url, video_id,
1338             note=note or 'Downloading MPD manifest',
1339             errnote=errnote or 'Failed to download MPD manifest',
1340             fatal=fatal)
1341         if res is False:
1342             return []
1343         mpd, urlh = res
1344         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1345
1346         return self._parse_mpd(
1347             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1348
1349     def _parse_mpd(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1350         if mpd_doc.get('type') == 'dynamic':
1351             return []
1352
1353         def extract_multisegment_info(element, ms_parent_info):
1354             ms_info = ms_parent_info.copy()
1355             segment_list = element.find(self._xpath_ns('SegmentList', namespace))
1356             if segment_list is not None:
1357                 segment_urls_e = segment_list.findall(self._xpath_ns('SegmentURL', namespace))
1358                 if segment_urls_e:
1359                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1360                 initialization = segment_list.find(self._xpath_ns('Initialization', namespace))
1361                 if initialization is not None:
1362                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1363             else:
1364                 segment_template = element.find(self._xpath_ns('SegmentTemplate', namespace))
1365                 if segment_template is not None:
1366                     start_number = segment_template.get('startNumber')
1367                     if start_number:
1368                         ms_info['start_number'] = int(start_number)
1369                     segment_timeline = segment_template.find(self._xpath_ns('SegmentTimeline', namespace))
1370                     if segment_timeline is not None:
1371                         s_e = segment_timeline.findall(self._xpath_ns('S', namespace))
1372                         if s_e:
1373                             ms_info['total_number'] = 0
1374                             for s in s_e:
1375                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1376                     else:
1377                         timescale = segment_template.get('timescale')
1378                         if timescale:
1379                             ms_info['timescale'] = int(timescale)
1380                         segment_duration = segment_template.get('duration')
1381                         if segment_duration:
1382                             ms_info['segment_duration'] = int(segment_duration)
1383                     media_template = segment_template.get('media')
1384                     if media_template:
1385                         ms_info['media_template'] = media_template
1386                     initialization = segment_template.get('initialization')
1387                     if initialization:
1388                         ms_info['initialization_url'] = initialization
1389                     else:
1390                         initialization = segment_template.find(self._xpath_ns('Initialization', namespace))
1391                         if initialization is not None:
1392                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1393             return ms_info
1394
1395         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace')
1396         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1397         formats = []
1398         for period in mpd_doc.findall(self._xpath_ns('Period', namespace)):
1399             period_duration = parse_duration(period.get('duration')) or mpd_duration
1400             period_ms_info = extract_multisegment_info(period, {
1401                 'start_number': 1,
1402                 'timescale': 1,
1403             })
1404             for adaptation_set in period.findall(self._xpath_ns('AdaptationSet', namespace)):
1405                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1406                 for representation in adaptation_set.findall(self._xpath_ns('Representation', namespace)):
1407                     representation_attrib = adaptation_set.attrib.copy()
1408                     representation_attrib.update(representation.attrib)
1409                     mime_type = representation_attrib.get('mimeType')
1410                     content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1411                     if content_type == 'text':
1412                         # TODO implement WebVTT downloading
1413                         pass
1414                     elif content_type == 'video' or content_type == 'audio':
1415                         base_url = ''
1416                         for element in (representation, adaptation_set, period, mpd_doc):
1417                             base_url_e = element.find(self._xpath_ns('BaseURL', namespace))
1418                             if base_url_e is not None:
1419                                 base_url = base_url_e.text + base_url
1420                                 if re.match(r'^https?://', base_url):
1421                                     break
1422                         if not re.match(r'^https?://', base_url):
1423                             base_url = mpd_base_url + base_url
1424                         representation_id = representation_attrib.get('id')
1425                         f = {
1426                             'format_id': mpd_id or representation_id,
1427                             'url': base_url,
1428                             'width': int_or_none(representation_attrib.get('width')),
1429                             'height': int_or_none(representation_attrib.get('height')),
1430                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1431                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1432                             'fps': int_or_none(representation_attrib.get('frameRate')),
1433                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1434                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1435                             'language': representation_attrib.get('lang'),
1436                             'format_note': 'DASH %s' % content_type,
1437                         }
1438                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1439                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1440                             if 'total_number' not in representation_ms_info and 'segment_duration':
1441                                 segment_duration = representation_ms_info['segment_duration'] / representation_ms_info['timescale']
1442                                 representation_ms_info['total_number'] = int(math.ceil(period_duration / segment_duration))
1443                             media_template = representation_ms_info['media_template']
1444                             media_template = media_template.replace('$RepresentationID$', representation_id)
1445                             media_template = re.sub(r'\$(Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1446                             media_template = media_template % {'Bandwidth': representation_attrib.get('bandwidth')}
1447                             media_template = re.sub(r'\$(Number)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1448                             media_template.replace('$$', '$')
1449                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1450                         if 'segment_urls' in representation_ms_info:
1451                             f.update({
1452                                 'segment_urls': representation_ms_info['segment_urls'],
1453                                 'protocol': 'http_dash_segments',
1454                             })
1455                             if 'initialization_url' in representation_ms_info:
1456                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1457                                 f.update({
1458                                     'initialization_url': initialization_url,
1459                                 })
1460                                 if not f.get('url'):
1461                                     f['url'] = initialization_url
1462                         try:
1463                             existing_format = next(
1464                                 fo for fo in formats
1465                                 if fo['format_id'] == representation_id)
1466                         except StopIteration:
1467                             full_info = formats_dict.get(representation_id, {}).copy()
1468                             full_info.update(f)
1469                             formats.append(full_info)
1470                         else:
1471                             existing_format.update(f)
1472                     else:
1473                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1474         return formats
1475
1476     def _live_title(self, name):
1477         """ Generate the title for a live video """
1478         now = datetime.datetime.now()
1479         now_str = now.strftime("%Y-%m-%d %H:%M")
1480         return name + ' ' + now_str
1481
1482     def _int(self, v, name, fatal=False, **kwargs):
1483         res = int_or_none(v, **kwargs)
1484         if 'get_attr' in kwargs:
1485             print(getattr(v, kwargs['get_attr']))
1486         if res is None:
1487             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1488             if fatal:
1489                 raise ExtractorError(msg)
1490             else:
1491                 self._downloader.report_warning(msg)
1492         return res
1493
1494     def _float(self, v, name, fatal=False, **kwargs):
1495         res = float_or_none(v, **kwargs)
1496         if res is None:
1497             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1498             if fatal:
1499                 raise ExtractorError(msg)
1500             else:
1501                 self._downloader.report_warning(msg)
1502         return res
1503
1504     def _set_cookie(self, domain, name, value, expire_time=None):
1505         cookie = compat_cookiejar.Cookie(
1506             0, name, value, None, None, domain, None,
1507             None, '/', True, False, expire_time, '', None, None, None)
1508         self._downloader.cookiejar.set_cookie(cookie)
1509
1510     def _get_cookies(self, url):
1511         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1512         req = sanitized_Request(url)
1513         self._downloader.cookiejar.add_cookie_header(req)
1514         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1515
1516     def get_testcases(self, include_onlymatching=False):
1517         t = getattr(self, '_TEST', None)
1518         if t:
1519             assert not hasattr(self, '_TESTS'), \
1520                 '%s has _TEST and _TESTS' % type(self).__name__
1521             tests = [t]
1522         else:
1523             tests = getattr(self, '_TESTS', [])
1524         for t in tests:
1525             if not include_onlymatching and t.get('only_matching', False):
1526                 continue
1527             t['name'] = type(self).__name__[:-len('IE')]
1528             yield t
1529
1530     def is_suitable(self, age_limit):
1531         """ Test whether the extractor is generally suitable for the given
1532         age limit (i.e. pornographic sites are not, all others usually are) """
1533
1534         any_restricted = False
1535         for tc in self.get_testcases(include_onlymatching=False):
1536             if 'playlist' in tc:
1537                 tc = tc['playlist'][0]
1538             is_restricted = age_restricted(
1539                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1540             if not is_restricted:
1541                 return True
1542             any_restricted = any_restricted or is_restricted
1543         return not any_restricted
1544
1545     def extract_subtitles(self, *args, **kwargs):
1546         if (self._downloader.params.get('writesubtitles', False) or
1547                 self._downloader.params.get('listsubtitles')):
1548             return self._get_subtitles(*args, **kwargs)
1549         return {}
1550
1551     def _get_subtitles(self, *args, **kwargs):
1552         raise NotImplementedError("This method must be implemented by subclasses")
1553
1554     @staticmethod
1555     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1556         """ Merge subtitle items for one language. Items with duplicated URLs
1557         will be dropped. """
1558         list1_urls = set([item['url'] for item in subtitle_list1])
1559         ret = list(subtitle_list1)
1560         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1561         return ret
1562
1563     @classmethod
1564     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1565         """ Merge two subtitle dictionaries, language by language. """
1566         ret = dict(subtitle_dict1)
1567         for lang in subtitle_dict2:
1568             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1569         return ret
1570
1571     def extract_automatic_captions(self, *args, **kwargs):
1572         if (self._downloader.params.get('writeautomaticsub', False) or
1573                 self._downloader.params.get('listsubtitles')):
1574             return self._get_automatic_captions(*args, **kwargs)
1575         return {}
1576
1577     def _get_automatic_captions(self, *args, **kwargs):
1578         raise NotImplementedError("This method must be implemented by subclasses")
1579
1580
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that handle search queries.

    Queries look like _SEARCH_KEY<prefix>:<query>, where <prefix> is empty
    (a single result), a positive number (that many results) or 'all'
    (_MAX_RESULTS results).
    _SEARCH_KEY and _MAX_RESULTS are read by this class, and
    _get_n_results has to be overridden.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source that matches this extractor's queries."""
        query_tail = r'(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return r'%s%s' % (cls._SEARCH_KEY, query_tail)

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the query prefix and fetch the requested number of results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        count_spec, search_terms = mobj.group('prefix', 'query')
        if count_spec == '':
            # No count given: a single result.
            return self._get_n_results(search_terms, 1)
        if count_spec == 'all':
            return self._get_n_results(search_terms, self._MAX_RESULTS)

        wanted = int(count_spec)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, search_terms))
        if wanted > self._MAX_RESULTS:
            # Clamp to the maximum and tell the user about it.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(search_terms, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        reason = "This method must be implemented by subclasses"
        raise NotImplementedError(reason)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY