_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_http_client,
  20     compat_urllib_error,
  21     compat_urllib_parse,
  22     compat_urlparse,
  23     compat_str,
  24     compat_etree_fromstring,
  25 )
  26 from ..utils import (
  27     NO_DEFAULT,
  28     age_restricted,
  29     bug_reports_message,
  30     clean_html,
  31     compiled_regex_type,
  32     determine_ext,
  33     error_to_compat_str,
  34     ExtractorError,
  35     fix_xml_ampersands,
  36     float_or_none,
  37     int_or_none,
  38     parse_iso8601,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     sanitized_Request,
  42     unescapeHTML,
  43     unified_strdate,
  44     url_basename,
  45     xpath_text,
  46     xpath_with_ns,
  47     determine_protocol,
  48     parse_duration,
  49 )
  50
  51
  52 class InfoExtractor(object):
  53     """Information Extractor class.
  54
  55     Information extractors are the classes that, given a URL, extract
  56     information about the video (or videos) the URL refers to. This
  57     information includes the real video URL, the video title, author and
  58     others. The information is stored in a dictionary which is then
  59     passed to the YoutubeDL. The YoutubeDL processes this
  60     information possibly downloading the video to the file system, among
  61     other possible outcomes.
  62
  63     The type field determines the type of the result.
  64     By far the most common value (and the default if _type is missing) is
  65     "video", which indicates a single video.
  66
  67     For a video, the dictionaries must include the following fields:
  68
  69     id:             Video identifier.
  70     title:          Video title, unescaped.
  71
  72     Additionally, it must contain either a formats entry or a url one:
  73
  74     formats:        A list of dictionaries for each format available, ordered
  75                     from worst to best quality.
  76
  77                     Potential fields:
  78                     * url        Mandatory. The URL of the video file
  79                     * ext        Will be calculated from URL if missing
  80                     * format     A human-readable description of the format
  81                                  ("mp4 container with h264/opus").
  82                                  Calculated from the format_id, width, height.
  83                                  and format_note fields if missing.
  84                     * format_id  A short description of the format
  85                                  ("mp4_h264_opus" or "19").
  86                                 Technically optional, but strongly recommended.
  87                     * format_note Additional info about the format
  88                                  ("3D" or "DASH video")
  89                     * width      Width of the video, if known
  90                     * height     Height of the video, if known
  91                     * resolution Textual description of width and height
  92                     * tbr        Average bitrate of audio and video in KBit/s
  93                     * abr        Average audio bitrate in KBit/s
  94                     * acodec     Name of the audio codec in use
  95                     * asr        Audio sampling rate in Hertz
  96                     * vbr        Average video bitrate in KBit/s
  97                     * fps        Frame rate
  98                     * vcodec     Name of the video codec in use
  99                     * container  Name of the container format
 100                     * filesize   The number of bytes, if known in advance
 101                     * filesize_approx  An estimate for the number of bytes
 102                     * player_url SWF Player URL (used for rtmpdump).
 103                     * protocol   The protocol that will be used for the actual
 104                                  download, lower-case.
 105                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 106                                  "m3u8", or "m3u8_native".
 107                     * preference Order number of this format. If this field is
 108                                  present and not None, the formats get sorted
 109                                  by this field, regardless of all other values.
 110                                  -1 for default (order by other properties),
 111                                  -2 or smaller for less than default.
 112                                  < -1000 to hide the format (if there is
 113                                     another one which is strictly better)
 114                     * language   Language code, e.g. "de" or "en-US".
 115                     * language_preference  Is this in the language mentioned in
 116                                  the URL?
 117                                  10 if it's what the URL is about,
 118                                  -1 for default (don't know),
 119                                  -10 otherwise, other values reserved for now.
 120                     * quality    Order number of the video quality of this
 121                                  format, irrespective of the file format.
 122                                  -1 for default (order by other properties),
 123                                  -2 or smaller for less than default.
 124                     * source_preference  Order number for this video source
 125                                   (quality takes higher priority)
 126                                  -1 for default (order by other properties),
 127                                  -2 or smaller for less than default.
 128                     * http_headers  A dictionary of additional HTTP headers
 129                                  to add to the request.
 130                     * stretched_ratio  If given and not 1, indicates that the
 131                                  video's pixels are not square.
 132                                  width : height ratio as float.
 133                     * no_resume  The server does not support resuming the
 134                                  (HTTP or RTMP) download. Boolean.
 135
 136     url:            Final video URL.
 137     ext:            Video filename extension.
 138     format:         The video format, defaults to ext (used for --get-format)
 139     player_url:     SWF Player URL (used for rtmpdump).
 140
 141     The following fields are optional:
 142
 143     alt_title:      A secondary title of the video.
 144     display_id      An alternative identifier for the video, not necessarily
 145                     unique, but available before title. Typically, id is
 146                     something like "4234987", title "Dancing naked mole rats",
 147                     and display_id "dancing-naked-mole-rats"
 148     thumbnails:     A list of dictionaries, with the following entries:
 149                         * "id" (optional, string) - Thumbnail format ID
 150                         * "url"
 151                         * "preference" (optional, int) - quality of the image
 152                         * "width" (optional, int)
 153                         * "height" (optional, int)
 154                         * "resolution" (optional, string "{width}x{height}",
 155                                         deprecated)
 156     thumbnail:      Full URL to a video thumbnail image.
 157     description:    Full video description.
 158     uploader:       Full name of the video uploader.
 159     creator:        The main artist who created the video.
 160     release_date:   The date (YYYYMMDD) when the video was released.
 161     timestamp:      UNIX timestamp of the moment the video became available.
 162     upload_date:    Video upload date (YYYYMMDD).
 163                     If not explicitly set, calculated from timestamp.
 164     uploader_id:    Nickname or id of the video uploader.
 165     location:       Physical location where the video was filmed.
 166     subtitles:      The available subtitles as a dictionary in the format
 167                     {language: subformats}. "subformats" is a list sorted from
 168                     lower to higher preference, each element is a dictionary
 169                     with the "ext" entry and one of:
 170                         * "data": The subtitles file contents
 171                         * "url": A URL pointing to the subtitles file
 172                     "ext" will be calculated from URL if missing
 173     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 174                     automatically generated captions
 175     duration:       Length of the video in seconds, as an integer or float.
 176     view_count:     How many users have watched the video on the platform.
 177     like_count:     Number of positive ratings of the video
 178     dislike_count:  Number of negative ratings of the video
 179     repost_count:   Number of reposts of the video
 180     average_rating: Average rating given by users, the scale used depends on the webpage
 181     comment_count:  Number of comments on the video
 182     comments:       A list of comments, each with one or more of the following
 183                     properties (all but one of text or html optional):
 184                         * "author" - human-readable name of the comment author
 185                         * "author_id" - user ID of the comment author
 186                         * "id" - Comment ID
 187                         * "html" - Comment as HTML
 188                         * "text" - Plain text of the comment
 189                         * "timestamp" - UNIX timestamp of comment
 190                         * "parent" - ID of the comment this one is replying to.
 191                                      Set to "root" to indicate that this is a
 192                                      comment to the original video.
 193     age_limit:      Age restriction for the video, as an integer (years)
 194     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 195                     should allow to get the same result again. (It will be set
 196                     by YoutubeDL if it's missing)
 197     categories:     A list of categories that the video falls in, for example
 198                     ["Sports", "Berlin"]
 199     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 200     is_live:        True, False, or None (=unknown). Whether this video is a
 201                     live stream that goes on instead of a fixed-length video.
 202     start_time:     Time in seconds where the reproduction should start, as
 203                     specified in the URL.
 204     end_time:       Time in seconds where the reproduction should end, as
 205                     specified in the URL.
 206
 207     The following fields should only be used when the video belongs to some logical
 208     chapter or section:
 209
 210     chapter:        Name or title of the chapter the video belongs to.
 211     chapter_number: Number of the chapter the video belongs to, as an integer.
 212     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 213
 214     The following fields should only be used when the video is an episode of some
 215     series or programme:
 216
 217     series:         Title of the series or programme the video episode belongs to.
 218     season:         Title of the season the video episode belongs to.
 219     season_number:  Number of the season the video episode belongs to, as an integer.
 220     season_id:      Id of the season the video episode belongs to, as a unicode string.
 221     episode:        Title of the video episode. Unlike mandatory video title field,
 222                     this field should denote the exact title of the video episode
 223                     without any kind of decoration.
 224     episode_number: Number of the video episode within a season, as an integer.
 225     episode_id:     Id of the video episode, as a unicode string.
 226
 227     Unless mentioned otherwise, the fields should be Unicode strings.
 228
 229     Unless mentioned otherwise, None is equivalent to absence of information.
 230
 231
 232     _type "playlist" indicates multiple videos.
 233     There must be a key "entries", which is a list, an iterable, or a PagedList
 234     object, each element of which is a valid dictionary by this specification.
 235
 236     Additionally, playlists can have "title", "description" and "id" attributes
 237     with the same semantics as videos (see above).
 238
 239
 240     _type "multi_video" indicates that there are multiple videos that
 241     form a single show, for example multiple acts of an opera or TV episode.
 242     It must have an entries key like a playlist and contain all the keys
 243     required for a video at the same time.
 244
 245
 246     _type "url" indicates that the video must be extracted from another
 247     location, possibly by a different extractor. Its only required key is:
 248     "url" - the next URL to extract.
 249     The key "ie_key" can be set to the class name (minus the trailing "IE",
 250     e.g. "Youtube") if the extractor class is known in advance.
 251     Additionally, the dictionary may have any properties of the resolved entity
 252     known in advance, for example "title" if the title of the referred video is
 253     known ahead of time.
 254
 255
 256     _type "url_transparent" entities have the same specification as "url", but
 257     indicate that the given additional information is more precise than the one
 258     associated with the resolved URL.
 259     This is useful when a site employs a video service that hosts the video and
 260     its technical metadata, but that video service does not embed a useful
 261     title, description etc.
 262
 263
 264     Subclasses of this one should re-define the _real_initialize() and
 265     _real_extract() methods and define a _VALID_URL regexp.
 266     Probably, they should also be added to the list of extractors.
 267
 268     Finally, the _WORKING attribute should be set to False for broken IEs
 269     in order to warn the users and skip the tests.
 270     """
 271
 272     _ready = False
 273     _downloader = None
 274     _WORKING = True
 275
 276     def __init__(self, downloader=None):
 277         """Constructor. Receives an optional downloader."""
 278         self._ready = False
 279         self.set_downloader(downloader)
 280
 281     @classmethod
 282     def suitable(cls, url):
 283         """Receives a URL and returns True if suitable for this IE."""
 284
 285         # This does not use has/getattr intentionally - we want to know whether
 286         # we have cached the regexp for *this* class, whereas getattr would also
 287         # match the superclass
 288         if '_VALID_URL_RE' not in cls.__dict__:
 289             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 290         return cls._VALID_URL_RE.match(url) is not None
 291
 292     @classmethod
 293     def _match_id(cls, url):
 294         if '_VALID_URL_RE' not in cls.__dict__:
 295             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 296         m = cls._VALID_URL_RE.match(url)
 297         assert m
 298         return m.group('id')
 299
 300     @classmethod
 301     def working(cls):
 302         """Getter method for _WORKING."""
 303         return cls._WORKING
 304
 305     def initialize(self):
 306         """Initializes an instance (authentication, etc)."""
 307         if not self._ready:
 308             self._real_initialize()
 309             self._ready = True
 310
 311     def extract(self, url):
 312         """Extracts URL information and returns it in list of dicts."""
 313         try:
 314             self.initialize()
 315             return self._real_extract(url)
 316         except ExtractorError:
 317             raise
 318         except compat_http_client.IncompleteRead as e:
 319             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 320         except (KeyError, StopIteration) as e:
 321             raise ExtractorError('An extractor error has occurred.', cause=e)
 322
 323     def set_downloader(self, downloader):
 324         """Sets the downloader for this IE."""
 325         self._downloader = downloader
 326
 327     def _real_initialize(self):
 328         """Real initialization process. Redefine in subclasses."""
 329         pass
 330
 331     def _real_extract(self, url):
 332         """Real extraction process. Redefine in subclasses."""
 333         pass
 334
 335     @classmethod
 336     def ie_key(cls):
 337         """A string for getting the InfoExtractor with get_info_extractor"""
 338         return compat_str(cls.__name__[:-2])
 339
 340     @property
 341     def IE_NAME(self):
 342         return compat_str(type(self).__name__[:-2])
 343
 344     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 345         """ Returns the response handle """
 346         if note is None:
 347             self.report_download_webpage(video_id)
 348         elif note is not False:
 349             if video_id is None:
 350                 self.to_screen('%s' % (note,))
 351             else:
 352                 self.to_screen('%s: %s' % (video_id, note))
 353         try:
 354             return self._downloader.urlopen(url_or_request)
 355         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 356             if errnote is False:
 357                 return False
 358             if errnote is None:
 359                 errnote = 'Unable to download webpage'
 360
 361             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 362             if fatal:
 363                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 364             else:
 365                 self._downloader.report_warning(errmsg)
 366                 return False
 367
 368     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 369         """ Returns a tuple (page content as string, URL handle) """
 370         # Strip hashes from the URL (#1038)
 371         if isinstance(url_or_request, (compat_str, str)):
 372             url_or_request = url_or_request.partition('#')[0]
 373
 374         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 375         if urlh is False:
 376             assert not fatal
 377             return False
 378         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 379         return (content, urlh)
 380
 381     @staticmethod
 382     def _guess_encoding_from_content(content_type, webpage_bytes):
 383         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 384         if m:
 385             encoding = m.group(1)
 386         else:
 387             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 388                           webpage_bytes[:1024])
 389             if m:
 390                 encoding = m.group(1).decode('ascii')
 391             elif webpage_bytes.startswith(b'\xff\xfe'):
 392                 encoding = 'utf-16'
 393             else:
 394                 encoding = 'utf-8'
 395
 396         return encoding
 397
 398     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 399         content_type = urlh.headers.get('Content-Type', '')
 400         webpage_bytes = urlh.read()
 401         if prefix is not None:
 402             webpage_bytes = prefix + webpage_bytes
 403         if not encoding:
 404             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 405         if self._downloader.params.get('dump_intermediate_pages', False):
 406             try:
 407                 url = url_or_request.get_full_url()
 408             except AttributeError:
 409                 url = url_or_request
 410             self.to_screen('Dumping request to ' + url)
 411             dump = base64.b64encode(webpage_bytes).decode('ascii')
 412             self._downloader.to_screen(dump)
 413         if self._downloader.params.get('write_pages', False):
 414             try:
 415                 url = url_or_request.get_full_url()
 416             except AttributeError:
 417                 url = url_or_request
 418             basen = '%s_%s' % (video_id, url)
 419             if len(basen) > 240:
 420                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 421                 basen = basen[:240 - len(h)] + h
 422             raw_filename = basen + '.dump'
 423             filename = sanitize_filename(raw_filename, restricted=True)
 424             self.to_screen('Saving request to ' + filename)
 425             # Working around MAX_PATH limitation on Windows (see
 426             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 427             if os.name == 'nt':
 428                 absfilepath = os.path.abspath(filename)
 429                 if len(absfilepath) > 259:
 430                     filename = '\\\\?\\' + absfilepath
 431             with open(filename, 'wb') as outf:
 432                 outf.write(webpage_bytes)
 433
 434         try:
 435             content = webpage_bytes.decode(encoding, 'replace')
 436         except LookupError:
 437             content = webpage_bytes.decode('utf-8', 'replace')
 438
 439         if ('<title>Access to this site is blocked</title>' in content and
 440                 'Websense' in content[:512]):
 441             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 442             blocked_iframe = self._html_search_regex(
 443                 r'<iframe src="([^"]+)"', content,
 444                 'Websense information URL', default=None)
 445             if blocked_iframe:
 446                 msg += ' Visit %s for more details' % blocked_iframe
 447             raise ExtractorError(msg, expected=True)
 448         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 449             msg = (
 450                 'Access to this webpage has been blocked by Indian censorship. '
 451                 'Use a VPN or proxy server (with --proxy) to route around it.')
 452             block_msg = self._html_search_regex(
 453                 r'</h1><p>(.*?)</p>',
 454                 content, 'block message', default=None)
 455             if block_msg:
 456                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 457             raise ExtractorError(msg, expected=True)
 458
 459         return content
 460
 461     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 462         """ Returns the data of the page as a string """
 463         success = False
 464         try_count = 0
 465         while success is False:
 466             try:
 467                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 468                 success = True
 469             except compat_http_client.IncompleteRead as e:
 470                 try_count += 1
 471                 if try_count >= tries:
 472                     raise e
 473                 self._sleep(timeout, video_id)
 474         if res is False:
 475             return res
 476         else:
 477             content, _ = res
 478             return content
 479
 480     def _download_xml(self, url_or_request, video_id,
 481                       note='Downloading XML', errnote='Unable to download XML',
 482                       transform_source=None, fatal=True, encoding=None):
 483         """Return the xml as an xml.etree.ElementTree.Element"""
 484         xml_string = self._download_webpage(
 485             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 486         if xml_string is False:
 487             return xml_string
 488         if transform_source:
 489             xml_string = transform_source(xml_string)
 490         return compat_etree_fromstring(xml_string.encode('utf-8'))
 491
 492     def _download_json(self, url_or_request, video_id,
 493                        note='Downloading JSON metadata',
 494                        errnote='Unable to download JSON metadata',
 495                        transform_source=None,
 496                        fatal=True, encoding=None):
 497         json_string = self._download_webpage(
 498             url_or_request, video_id, note, errnote, fatal=fatal,
 499             encoding=encoding)
 500         if (not fatal) and json_string is False:
 501             return None
 502         return self._parse_json(
 503             json_string, video_id, transform_source=transform_source, fatal=fatal)
 504
 505     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 506         if transform_source:
 507             json_string = transform_source(json_string)
 508         try:
 509             return json.loads(json_string)
 510         except ValueError as ve:
 511             errmsg = '%s: Failed to parse JSON ' % video_id
 512             if fatal:
 513                 raise ExtractorError(errmsg, cause=ve)
 514             else:
 515                 self.report_warning(errmsg + str(ve))
 516
 517     def report_warning(self, msg, video_id=None):
 518         idstr = '' if video_id is None else '%s: ' % video_id
 519         self._downloader.report_warning(
 520             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 521
 522     def to_screen(self, msg):
 523         """Print msg to screen, prefixing it with '[ie_name]'"""
 524         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 525
 526     def report_extraction(self, id_or_name):
 527         """Report information extraction."""
 528         self.to_screen('%s: Extracting information' % id_or_name)
 529
 530     def report_download_webpage(self, video_id):
 531         """Report webpage download."""
 532         self.to_screen('%s: Downloading webpage' % video_id)
 533
 534     def report_age_confirmation(self):
 535         """Report attempt to confirm age."""
 536         self.to_screen('Confirming age')
 537
 538     def report_login(self):
 539         """Report attempt to log in."""
 540         self.to_screen('Logging in')
 541
 542     @staticmethod
 543     def raise_login_required(msg='This video is only available for registered users'):
 544         raise ExtractorError(
 545             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 546             expected=True)
 547
 548     @staticmethod
 549     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 550         raise ExtractorError(
 551             '%s. You might want to use --proxy to workaround.' % msg,
 552             expected=True)
 553
 554     # Methods for following #608
 555     @staticmethod
 556     def url_result(url, ie=None, video_id=None, video_title=None):
 557         """Returns a URL that points to a page that should be processed"""
 558         # TODO: ie should be the class used for getting the info
 559         video_info = {'_type': 'url',
 560                       'url': url,
 561                       'ie_key': ie}
 562         if video_id is not None:
 563             video_info['id'] = video_id
 564         if video_title is not None:
 565             video_info['title'] = video_title
 566         return video_info
 567
 568     @staticmethod
 569     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 570         """Returns a playlist"""
 571         video_info = {'_type': 'playlist',
 572                       'entries': entries}
 573         if playlist_id:
 574             video_info['id'] = playlist_id
 575         if playlist_title:
 576             video_info['title'] = playlist_title
 577         if playlist_description:
 578             video_info['description'] = playlist_description
 579         return video_info
 580
 581     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 582         """
 583         Perform a regex search on the given string, using a single or a list of
 584         patterns returning the first matching group.
 585         In case of failure return a default value or raise a WARNING or a
 586         RegexNotFoundError, depending on fatal, specifying the field name.
 587         """
 588         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 589             mobj = re.search(pattern, string, flags)
 590         else:
 591             for p in pattern:
 592                 mobj = re.search(p, string, flags)
 593                 if mobj:
 594                     break
 595
 596         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 597             _name = '\033[0;34m%s\033[0m' % name
 598         else:
 599             _name = name
 600
 601         if mobj:
 602             if group is None:
 603                 # return the first matching group
 604                 return next(g for g in mobj.groups() if g is not None)
 605             else:
 606                 return mobj.group(group)
 607         elif default is not NO_DEFAULT:
 608             return default
 609         elif fatal:
 610             raise RegexNotFoundError('Unable to extract %s' % _name)
 611         else:
 612             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 613             return None
 614
 615     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 616         """
 617         Like _search_regex, but strips HTML tags and unescapes entities.
 618         """
 619         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 620         if res:
 621             return clean_html(res).strip()
 622         else:
 623             return res
 624
 625     def _get_login_info(self):
 626         """
 627         Get the login info as (username, password)
 628         It will look in the netrc file using the _NETRC_MACHINE value
 629         If there's no info available, return (None, None)
 630         """
 631         if self._downloader is None:
 632             return (None, None)
 633
 634         username = None
 635         password = None
 636         downloader_params = self._downloader.params
 637
 638         # Attempt to use provided username and password or .netrc data
 639         if downloader_params.get('username', None) is not None:
 640             username = downloader_params['username']
 641             password = downloader_params['password']
 642         elif downloader_params.get('usenetrc', False):
 643             try:
 644                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 645                 if info is not None:
 646                     username = info[0]
 647                     password = info[2]
 648                 else:
 649                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 650             except (IOError, netrc.NetrcParseError) as err:
 651                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 652
 653         return (username, password)
 654
 655     def _get_tfa_info(self, note='two-factor verification code'):
 656         """
 657         Get the two-factor authentication info
 658         TODO - asking the user will be required for sms/phone verify
 659         currently just uses the command line option
 660         If there's no info available, return None
 661         """
 662         if self._downloader is None:
 663             return None
 664         downloader_params = self._downloader.params
 665
 666         if downloader_params.get('twofactor', None) is not None:
 667             return downloader_params['twofactor']
 668
 669         return compat_getpass('Type %s and press [Return]: ' % note)
 670
 671     # Helper functions for extracting OpenGraph info
 672     @staticmethod
 673     def _og_regexes(prop):
 674         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 675         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 676                        % {'prop': re.escape(prop)})
 677         template = r'<meta[^>]+?%s[^>]+?%s'
 678         return [
 679             template % (property_re, content_re),
 680             template % (content_re, property_re),
 681         ]
 682
 683     @staticmethod
 684     def _meta_regex(prop):
 685         return r'''(?isx)<meta
 686                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 687                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 688
 689     def _og_search_property(self, prop, html, name=None, **kargs):
 690         if name is None:
 691             name = 'OpenGraph %s' % prop
 692         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 693         if escaped is None:
 694             return None
 695         return unescapeHTML(escaped)
 696
 697     def _og_search_thumbnail(self, html, **kargs):
 698         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 699
 700     def _og_search_description(self, html, **kargs):
 701         return self._og_search_property('description', html, fatal=False, **kargs)
 702
 703     def _og_search_title(self, html, **kargs):
 704         return self._og_search_property('title', html, **kargs)
 705
 706     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 707         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 708         if secure:
 709             regexes = self._og_regexes('video:secure_url') + regexes
 710         return self._html_search_regex(regexes, html, name, **kargs)
 711
 712     def _og_search_url(self, html, **kargs):
 713         return self._og_search_property('url', html, **kargs)
 714
 715     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 716         if display_name is None:
 717             display_name = name
 718         return self._html_search_regex(
 719             self._meta_regex(name),
 720             html, display_name, fatal=fatal, group='content', **kwargs)
 721
 722     def _dc_search_uploader(self, html):
 723         return self._html_search_meta('dc.creator', html, 'uploader')
 724
 725     def _rta_search(self, html):
 726         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 727         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 728                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 729                      html):
 730             return 18
 731         return 0
 732
 733     def _media_rating_search(self, html):
 734         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 735         rating = self._html_search_meta('rating', html)
 736
 737         if not rating:
 738             return None
 739
 740         RATING_TABLE = {
 741             'safe for kids': 0,
 742             'general': 8,
 743             '14 years': 14,
 744             'mature': 17,
 745             'restricted': 19,
 746         }
 747         return RATING_TABLE.get(rating.lower(), None)
 748
 749     def _family_friendly_search(self, html):
 750         # See http://schema.org/VideoObject
 751         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 752
 753         if not family_friendly:
 754             return None
 755
 756         RATING_TABLE = {
 757             '1': 0,
 758             'true': 0,
 759             '0': 18,
 760             'false': 18,
 761         }
 762         return RATING_TABLE.get(family_friendly.lower(), None)
 763
 764     def _twitter_search_player(self, html):
 765         return self._html_search_meta('twitter:player', html,
 766                                       'twitter card player')
 767
 768     def _search_json_ld(self, html, video_id, **kwargs):
 769         json_ld = self._search_regex(
 770             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 771             html, 'JSON-LD', group='json_ld', **kwargs)
 772         if not json_ld:
 773             return {}
 774         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 775
 776     def _json_ld(self, json_ld, video_id, fatal=True):
 777         if isinstance(json_ld, compat_str):
 778             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 779         if not json_ld:
 780             return {}
 781         info = {}
 782         if json_ld.get('@context') == 'http://schema.org':
 783             item_type = json_ld.get('@type')
 784             if item_type == 'TVEpisode':
 785                 info.update({
 786                     'episode': unescapeHTML(json_ld.get('name')),
 787                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 788                     'description': unescapeHTML(json_ld.get('description')),
 789                 })
 790                 part_of_season = json_ld.get('partOfSeason')
 791                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 792                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 793                 part_of_series = json_ld.get('partOfSeries')
 794                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 795                     info['series'] = unescapeHTML(part_of_series.get('name'))
 796             elif item_type == 'Article':
 797                 info.update({
 798                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 799                     'title': unescapeHTML(json_ld.get('headline')),
 800                     'description': unescapeHTML(json_ld.get('articleBody')),
 801                 })
 802         return dict((k, v) for k, v in info.items() if v is not None)
 803
 804     @staticmethod
 805     def _hidden_inputs(html):
 806         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 807         hidden_inputs = {}
 808         for input in re.findall(r'(?i)<input([^>]+)>', html):
 809             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 810                 continue
 811             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 812             if not name:
 813                 continue
 814             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 815             if not value:
 816                 continue
 817             hidden_inputs[name.group('value')] = value.group('value')
 818         return hidden_inputs
 819
 820     def _form_hidden_inputs(self, form_id, html):
 821         form = self._search_regex(
 822             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 823             html, '%s form' % form_id, group='form')
 824         return self._hidden_inputs(form)
 825
 826     def _sort_formats(self, formats, field_preference=None):
 827         if not formats:
 828             raise ExtractorError('No video formats found')
 829
 830         for f in formats:
 831             # Automatically determine tbr when missing based on abr and vbr (improves
 832             # formats sorting in some cases)
 833             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 834                 f['tbr'] = f['abr'] + f['vbr']
 835
 836         def _formats_key(f):
 837             # TODO remove the following workaround
 838             from ..utils import determine_ext
 839             if not f.get('ext') and 'url' in f:
 840                 f['ext'] = determine_ext(f['url'])
 841
 842             if isinstance(field_preference, (list, tuple)):
 843                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 844
 845             preference = f.get('preference')
 846             if preference is None:
 847                 preference = 0
 848                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 849                     preference -= 0.5
 850
 851             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 852
 853             if f.get('vcodec') == 'none':  # audio only
 854                 preference -= 50
 855                 if self._downloader.params.get('prefer_free_formats'):
 856                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 857                 else:
 858                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 859                 ext_preference = 0
 860                 try:
 861                     audio_ext_preference = ORDER.index(f['ext'])
 862                 except ValueError:
 863                     audio_ext_preference = -1
 864             else:
 865                 if f.get('acodec') == 'none':  # video only
 866                     preference -= 40
 867                 if self._downloader.params.get('prefer_free_formats'):
 868                     ORDER = ['flv', 'mp4', 'webm']
 869                 else:
 870                     ORDER = ['webm', 'flv', 'mp4']
 871                 try:
 872                     ext_preference = ORDER.index(f['ext'])
 873                 except ValueError:
 874                     ext_preference = -1
 875                 audio_ext_preference = 0
 876
 877             return (
 878                 preference,
 879                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 880                 f.get('quality') if f.get('quality') is not None else -1,
 881                 f.get('tbr') if f.get('tbr') is not None else -1,
 882                 f.get('filesize') if f.get('filesize') is not None else -1,
 883                 f.get('vbr') if f.get('vbr') is not None else -1,
 884                 f.get('height') if f.get('height') is not None else -1,
 885                 f.get('width') if f.get('width') is not None else -1,
 886                 proto_preference,
 887                 ext_preference,
 888                 f.get('abr') if f.get('abr') is not None else -1,
 889                 audio_ext_preference,
 890                 f.get('fps') if f.get('fps') is not None else -1,
 891                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 892                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 893                 f.get('format_id') if f.get('format_id') is not None else '',
 894             )
 895         formats.sort(key=_formats_key)
 896
 897     def _check_formats(self, formats, video_id):
 898         if formats:
 899             formats[:] = filter(
 900                 lambda f: self._is_valid_url(
 901                     f['url'], video_id,
 902                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 903                 formats)
 904
 905     def _is_valid_url(self, url, video_id, item='video'):
 906         url = self._proto_relative_url(url, scheme='http:')
 907         # For now assume non HTTP(S) URLs always valid
 908         if not (url.startswith('http://') or url.startswith('https://')):
 909             return True
 910         try:
 911             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 912             return True
 913         except ExtractorError as e:
 914             if isinstance(e.cause, compat_urllib_error.URLError):
 915                 self.to_screen(
 916                     '%s: %s URL is invalid, skipping' % (video_id, item))
 917                 return False
 918             raise
 919
 920     def http_scheme(self):
 921         """ Either "http:" or "https:", depending on the user's preferences """
 922         return (
 923             'http:'
 924             if self._downloader.params.get('prefer_insecure', False)
 925             else 'https:')
 926
 927     def _proto_relative_url(self, url, scheme=None):
 928         if url is None:
 929             return url
 930         if url.startswith('//'):
 931             if scheme is None:
 932                 scheme = self.http_scheme()
 933             return scheme + url
 934         else:
 935             return url
 936
 937     def _sleep(self, timeout, video_id, msg_template=None):
 938         if msg_template is None:
 939             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 940         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 941         self.to_screen(msg)
 942         time.sleep(timeout)
 943
 944     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 945                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 946                              fatal=True):
 947         manifest = self._download_xml(
 948             manifest_url, video_id, 'Downloading f4m manifest',
 949             'Unable to download f4m manifest',
 950             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 951             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 952             transform_source=transform_source,
 953             fatal=fatal)
 954
 955         if manifest is False:
 956             return []
 957
 958         formats = []
 959         manifest_version = '1.0'
 960         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 961         if not media_nodes:
 962             manifest_version = '2.0'
 963             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 964         base_url = xpath_text(
 965             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 966             'base URL', default=None)
 967         if base_url:
 968             base_url = base_url.strip()
 969         for i, media_el in enumerate(media_nodes):
 970             if manifest_version == '2.0':
 971                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 972                 if not media_url:
 973                     continue
 974                 manifest_url = (
 975                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 976                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 977                 # If media_url is itself a f4m manifest do the recursive extraction
 978                 # since bitrates in parent manifest (this one) and media_url manifest
 979                 # may differ leading to inability to resolve the format by requested
 980                 # bitrate in f4m downloader
 981                 if determine_ext(manifest_url) == 'f4m':
 982                     formats.extend(self._extract_f4m_formats(
 983                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
 984                     continue
 985             tbr = int_or_none(media_el.attrib.get('bitrate'))
 986             formats.append({
 987                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 988                 'url': manifest_url,
 989                 'ext': 'flv',
 990                 'tbr': tbr,
 991                 'width': int_or_none(media_el.attrib.get('width')),
 992                 'height': int_or_none(media_el.attrib.get('height')),
 993                 'preference': preference,
 994             })
 995         self._sort_formats(formats)
 996
 997         return formats
 998
 999     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1000                               entry_protocol='m3u8', preference=None,
1001                               m3u8_id=None, note=None, errnote=None,
1002                               fatal=True):
1003
1004         formats = [{
1005             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1006             'url': m3u8_url,
1007             'ext': ext,
1008             'protocol': 'm3u8',
1009             'preference': preference - 1 if preference else -1,
1010             'resolution': 'multiple',
1011             'format_note': 'Quality selection URL',
1012         }]
1013
1014         format_url = lambda u: (
1015             u
1016             if re.match(r'^https?://', u)
1017             else compat_urlparse.urljoin(m3u8_url, u))
1018
1019         res = self._download_webpage_handle(
1020             m3u8_url, video_id,
1021             note=note or 'Downloading m3u8 information',
1022             errnote=errnote or 'Failed to download m3u8 information',
1023             fatal=fatal)
1024         if res is False:
1025             return []
1026         m3u8_doc, urlh = res
1027         m3u8_url = urlh.geturl()
1028         # A Media Playlist Tag MUST NOT appear in a Master Playlist
1029         # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1030         # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
1031         # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1032         if '#EXT-X-TARGETDURATION' in m3u8_doc:
1033             return [{
1034                 'url': m3u8_url,
1035                 'format_id': m3u8_id,
1036                 'ext': ext,
1037                 'protocol': entry_protocol,
1038                 'preference': preference,
1039             }]
1040         last_info = None
1041         last_media = None
1042         kv_rex = re.compile(
1043             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1044         for line in m3u8_doc.splitlines():
1045             if line.startswith('#EXT-X-STREAM-INF:'):
1046                 last_info = {}
1047                 for m in kv_rex.finditer(line):
1048                     v = m.group('val')
1049                     if v.startswith('"'):
1050                         v = v[1:-1]
1051                     last_info[m.group('key')] = v
1052             elif line.startswith('#EXT-X-MEDIA:'):
1053                 last_media = {}
1054                 for m in kv_rex.finditer(line):
1055                     v = m.group('val')
1056                     if v.startswith('"'):
1057                         v = v[1:-1]
1058                     last_media[m.group('key')] = v
1059             elif line.startswith('#') or not line.strip():
1060                 continue
1061             else:
1062                 if last_info is None:
1063                     formats.append({'url': format_url(line)})
1064                     continue
1065                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1066                 format_id = []
1067                 if m3u8_id:
1068                     format_id.append(m3u8_id)
1069                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1070                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1071                 f = {
1072                     'format_id': '-'.join(format_id),
1073                     'url': format_url(line.strip()),
1074                     'tbr': tbr,
1075                     'ext': ext,
1076                     'protocol': entry_protocol,
1077                     'preference': preference,
1078                 }
1079                 codecs = last_info.get('CODECS')
1080                 if codecs:
1081                     # TODO: looks like video codec is not always necessarily goes first
1082                     va_codecs = codecs.split(',')
1083                     if va_codecs[0]:
1084                         f['vcodec'] = va_codecs[0]
1085                     if len(va_codecs) > 1 and va_codecs[1]:
1086                         f['acodec'] = va_codecs[1]
1087                 resolution = last_info.get('RESOLUTION')
1088                 if resolution:
1089                     width_str, height_str = resolution.split('x')
1090                     f['width'] = int(width_str)
1091                     f['height'] = int(height_str)
1092                 if last_media is not None:
1093                     f['m3u8_media'] = last_media
1094                     last_media = None
1095                 formats.append(f)
1096                 last_info = {}
1097         self._sort_formats(formats)
1098         return formats
1099
1100     @staticmethod
1101     def _xpath_ns(path, namespace=None):
1102         if not namespace:
1103             return path
1104         out = []
1105         for c in path.split('/'):
1106             if not c or c == '.':
1107                 out.append(c)
1108             else:
1109                 out.append('{%s}%s' % (namespace, c))
1110         return '/'.join(out)
1111
1112     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1113         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1114
1115         if smil is False:
1116             assert not fatal
1117             return []
1118
1119         namespace = self._parse_smil_namespace(smil)
1120
1121         return self._parse_smil_formats(
1122             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1123
1124     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1125         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1126         if smil is False:
1127             return {}
1128         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1129
1130     def _download_smil(self, smil_url, video_id, fatal=True):
1131         return self._download_xml(
1132             smil_url, video_id, 'Downloading SMIL file',
1133             'Unable to download SMIL file', fatal=fatal)
1134
1135     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1136         namespace = self._parse_smil_namespace(smil)
1137
1138         formats = self._parse_smil_formats(
1139             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1140         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1141
1142         video_id = os.path.splitext(url_basename(smil_url))[0]
1143         title = None
1144         description = None
1145         upload_date = None
1146         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1147             name = meta.attrib.get('name')
1148             content = meta.attrib.get('content')
1149             if not name or not content:
1150                 continue
1151             if not title and name == 'title':
1152                 title = content
1153             elif not description and name in ('description', 'abstract'):
1154                 description = content
1155             elif not upload_date and name == 'date':
1156                 upload_date = unified_strdate(content)
1157
1158         thumbnails = [{
1159             'id': image.get('type'),
1160             'url': image.get('src'),
1161             'width': int_or_none(image.get('width')),
1162             'height': int_or_none(image.get('height')),
1163         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1164
1165         return {
1166             'id': video_id,
1167             'title': title or video_id,
1168             'description': description,
1169             'upload_date': upload_date,
1170             'thumbnails': thumbnails,
1171             'formats': formats,
1172             'subtitles': subtitles,
1173         }
1174
1175     def _parse_smil_namespace(self, smil):
1176         return self._search_regex(
1177             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1178
1179     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1180         base = smil_url
1181         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1182             b = meta.get('base') or meta.get('httpBase')
1183             if b:
1184                 base = b
1185                 break
1186
1187         formats = []
1188         rtmp_count = 0
1189         http_count = 0
1190         m3u8_count = 0
1191
1192         src_urls = []
1193         videos = smil.findall(self._xpath_ns('.//video', namespace))
1194         for video in videos:
1195             src = video.get('src')
1196             if not src:
1197                 continue
1198
1199             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1200             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1201             width = int_or_none(video.get('width'))
1202             height = int_or_none(video.get('height'))
1203             proto = video.get('proto')
1204             ext = video.get('ext')
1205             src_ext = determine_ext(src)
1206             streamer = video.get('streamer') or base
1207
1208             if proto == 'rtmp' or streamer.startswith('rtmp'):
1209                 rtmp_count += 1
1210                 formats.append({
1211                     'url': streamer,
1212                     'play_path': src,
1213                     'ext': 'flv',
1214                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1215                     'tbr': bitrate,
1216                     'filesize': filesize,
1217                     'width': width,
1218                     'height': height,
1219                 })
1220                 if transform_rtmp_url:
1221                     streamer, src = transform_rtmp_url(streamer, src)
1222                     formats[-1].update({
1223                         'url': streamer,
1224                         'play_path': src,
1225                     })
1226                 continue
1227
1228             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1229             if src_url in src_urls:
1230                 continue
1231             src_urls.append(src_url)
1232
1233             if proto == 'm3u8' or src_ext == 'm3u8':
1234                 m3u8_formats = self._extract_m3u8_formats(
1235                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1236                 if len(m3u8_formats) == 1:
1237                     m3u8_count += 1
1238                     m3u8_formats[0].update({
1239                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1240                         'tbr': bitrate,
1241                         'width': width,
1242                         'height': height,
1243                     })
1244                 formats.extend(m3u8_formats)
1245                 continue
1246
1247             if src_ext == 'f4m':
1248                 f4m_url = src_url
1249                 if not f4m_params:
1250                     f4m_params = {
1251                         'hdcore': '3.2.0',
1252                         'plugin': 'flowplayer-3.2.0.1',
1253                     }
1254                 f4m_url += '&' if '?' in f4m_url else '?'
1255                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1256                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1257                 continue
1258
1259             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1260                 http_count += 1
1261                 formats.append({
1262                     'url': src_url,
1263                     'ext': ext or src_ext or 'flv',
1264                     'format_id': 'http-%d' % (bitrate or http_count),
1265                     'tbr': bitrate,
1266                     'filesize': filesize,
1267                     'width': width,
1268                     'height': height,
1269                 })
1270                 continue
1271
1272         self._sort_formats(formats)
1273
1274         return formats
1275
1276     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1277         urls = []
1278         subtitles = {}
1279         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1280             src = textstream.get('src')
1281             if not src or src in urls:
1282                 continue
1283             urls.append(src)
1284             ext = textstream.get('ext') or determine_ext(src)
1285             if not ext:
1286                 type_ = textstream.get('type')
1287                 SUBTITLES_TYPES = {
1288                     'text/vtt': 'vtt',
1289                     'text/srt': 'srt',
1290                     'application/smptett+xml': 'tt',
1291                 }
1292                 if type_ in SUBTITLES_TYPES:
1293                     ext = SUBTITLES_TYPES[type_]
1294             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1295             subtitles.setdefault(lang, []).append({
1296                 'url': src,
1297                 'ext': ext,
1298             })
1299         return subtitles
1300
1301     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1302         xspf = self._download_xml(
1303             playlist_url, playlist_id, 'Downloading xpsf playlist',
1304             'Unable to download xspf manifest', fatal=fatal)
1305         if xspf is False:
1306             return []
1307         return self._parse_xspf(xspf, playlist_id)
1308
1309     def _parse_xspf(self, playlist, playlist_id):
1310         NS_MAP = {
1311             'xspf': 'http://xspf.org/ns/0/',
1312             's1': 'http://static.streamone.nl/player/ns/0',
1313         }
1314
1315         entries = []
1316         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1317             title = xpath_text(
1318                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1319             description = xpath_text(
1320                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1321             thumbnail = xpath_text(
1322                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1323             duration = float_or_none(
1324                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1325
1326             formats = [{
1327                 'url': location.text,
1328                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1329                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1330                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1331             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1332             self._sort_formats(formats)
1333
1334             entries.append({
1335                 'id': playlist_id,
1336                 'title': title,
1337                 'description': description,
1338                 'thumbnail': thumbnail,
1339                 'duration': duration,
1340                 'formats': formats,
1341             })
1342         return entries
1343
1344     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1345         res = self._download_webpage_handle(
1346             mpd_url, video_id,
1347             note=note or 'Downloading MPD manifest',
1348             errnote=errnote or 'Failed to download MPD manifest',
1349             fatal=fatal)
1350         if res is False:
1351             return []
1352         mpd, urlh = res
1353         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1354
1355         return self._parse_mpd_formats(
1356             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1357
1358     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1359         if mpd_doc.get('type') == 'dynamic':
1360             return []
1361
1362         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1363
1364         def _add_ns(path):
1365             return self._xpath_ns(path, namespace)
1366
1367         def is_drm_protected(element):
1368             return element.find(_add_ns('ContentProtection')) is not None
1369
1370         def extract_multisegment_info(element, ms_parent_info):
1371             ms_info = ms_parent_info.copy()
1372             segment_list = element.find(_add_ns('SegmentList'))
1373             if segment_list is not None:
1374                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1375                 if segment_urls_e:
1376                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1377                 initialization = segment_list.find(_add_ns('Initialization'))
1378                 if initialization is not None:
1379                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1380             else:
1381                 segment_template = element.find(_add_ns('SegmentTemplate'))
1382                 if segment_template is not None:
1383                     start_number = segment_template.get('startNumber')
1384                     if start_number:
1385                         ms_info['start_number'] = int(start_number)
1386                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1387                     if segment_timeline is not None:
1388                         s_e = segment_timeline.findall(_add_ns('S'))
1389                         if s_e:
1390                             ms_info['total_number'] = 0
1391                             for s in s_e:
1392                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1393                     else:
1394                         timescale = segment_template.get('timescale')
1395                         if timescale:
1396                             ms_info['timescale'] = int(timescale)
1397                         segment_duration = segment_template.get('duration')
1398                         if segment_duration:
1399                             ms_info['segment_duration'] = int(segment_duration)
1400                     media_template = segment_template.get('media')
1401                     if media_template:
1402                         ms_info['media_template'] = media_template
1403                     initialization = segment_template.get('initialization')
1404                     if initialization:
1405                         ms_info['initialization_url'] = initialization
1406                     else:
1407                         initialization = segment_template.find(_add_ns('Initialization'))
1408                         if initialization is not None:
1409                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1410             return ms_info
1411
1412         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1413         formats = []
1414         for period in mpd_doc.findall(_add_ns('Period')):
1415             period_duration = parse_duration(period.get('duration')) or mpd_duration
1416             period_ms_info = extract_multisegment_info(period, {
1417                 'start_number': 1,
1418                 'timescale': 1,
1419             })
1420             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1421                 if is_drm_protected(adaptation_set):
1422                     continue
1423                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1424                 for representation in adaptation_set.findall(_add_ns('Representation')):
1425                     if is_drm_protected(representation):
1426                         continue
1427                     representation_attrib = adaptation_set.attrib.copy()
1428                     representation_attrib.update(representation.attrib)
1429                     mime_type = representation_attrib.get('mimeType')
1430                     content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1431                     if content_type == 'text':
1432                         # TODO implement WebVTT downloading
1433                         pass
1434                     elif content_type == 'video' or content_type == 'audio':
1435                         base_url = ''
1436                         for element in (representation, adaptation_set, period, mpd_doc):
1437                             base_url_e = element.find(_add_ns('BaseURL'))
1438                             if base_url_e is not None:
1439                                 base_url = base_url_e.text + base_url
1440                                 if re.match(r'^https?://', base_url):
1441                                     break
1442                         if not re.match(r'^https?://', base_url):
1443                             base_url = mpd_base_url + base_url
1444                         representation_id = representation_attrib.get('id')
1445                         lang = representation_attrib.get('lang')
1446                         url_el = representation.find(_add_ns('BaseURL'))
1447                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1448                         f = {
1449                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1450                             'url': base_url,
1451                             'width': int_or_none(representation_attrib.get('width')),
1452                             'height': int_or_none(representation_attrib.get('height')),
1453                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1454                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1455                             'fps': int_or_none(representation_attrib.get('frameRate')),
1456                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1457                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1458                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1459                             'format_note': 'DASH %s' % content_type,
1460                             'filesize': filesize,
1461                         }
1462                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1463                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1464                             if 'total_number' not in representation_ms_info and 'segment_duration':
1465                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1466                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1467                             media_template = representation_ms_info['media_template']
1468                             media_template = media_template.replace('$RepresentationID$', representation_id)
1469                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1470                             media_template.replace('$$', '$')
1471                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1472                         if 'segment_urls' in representation_ms_info:
1473                             f.update({
1474                                 'segment_urls': representation_ms_info['segment_urls'],
1475                                 'protocol': 'http_dash_segments',
1476                             })
1477                             if 'initialization_url' in representation_ms_info:
1478                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1479                                 f.update({
1480                                     'initialization_url': initialization_url,
1481                                 })
1482                                 if not f.get('url'):
1483                                     f['url'] = initialization_url
1484                         try:
1485                             existing_format = next(
1486                                 fo for fo in formats
1487                                 if fo['format_id'] == representation_id)
1488                         except StopIteration:
1489                             full_info = formats_dict.get(representation_id, {}).copy()
1490                             full_info.update(f)
1491                             formats.append(full_info)
1492                         else:
1493                             existing_format.update(f)
1494                     else:
1495                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1496         self._sort_formats(formats)
1497         return formats
1498
1499     def _live_title(self, name):
1500         """ Generate the title for a live video """
1501         now = datetime.datetime.now()
1502         now_str = now.strftime("%Y-%m-%d %H:%M")
1503         return name + ' ' + now_str
1504
1505     def _int(self, v, name, fatal=False, **kwargs):
1506         res = int_or_none(v, **kwargs)
1507         if 'get_attr' in kwargs:
1508             print(getattr(v, kwargs['get_attr']))
1509         if res is None:
1510             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1511             if fatal:
1512                 raise ExtractorError(msg)
1513             else:
1514                 self._downloader.report_warning(msg)
1515         return res
1516
1517     def _float(self, v, name, fatal=False, **kwargs):
1518         res = float_or_none(v, **kwargs)
1519         if res is None:
1520             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1521             if fatal:
1522                 raise ExtractorError(msg)
1523             else:
1524                 self._downloader.report_warning(msg)
1525         return res
1526
1527     def _set_cookie(self, domain, name, value, expire_time=None):
1528         cookie = compat_cookiejar.Cookie(
1529             0, name, value, None, None, domain, None,
1530             None, '/', True, False, expire_time, '', None, None, None)
1531         self._downloader.cookiejar.set_cookie(cookie)
1532
1533     def _get_cookies(self, url):
1534         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1535         req = sanitized_Request(url)
1536         self._downloader.cookiejar.add_cookie_header(req)
1537         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1538
1539     def get_testcases(self, include_onlymatching=False):
1540         t = getattr(self, '_TEST', None)
1541         if t:
1542             assert not hasattr(self, '_TESTS'), \
1543                 '%s has _TEST and _TESTS' % type(self).__name__
1544             tests = [t]
1545         else:
1546             tests = getattr(self, '_TESTS', [])
1547         for t in tests:
1548             if not include_onlymatching and t.get('only_matching', False):
1549                 continue
1550             t['name'] = type(self).__name__[:-len('IE')]
1551             yield t
1552
1553     def is_suitable(self, age_limit):
1554         """ Test whether the extractor is generally suitable for the given
1555         age limit (i.e. pornographic sites are not, all others usually are) """
1556
1557         any_restricted = False
1558         for tc in self.get_testcases(include_onlymatching=False):
1559             if 'playlist' in tc:
1560                 tc = tc['playlist'][0]
1561             is_restricted = age_restricted(
1562                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1563             if not is_restricted:
1564                 return True
1565             any_restricted = any_restricted or is_restricted
1566         return not any_restricted
1567
1568     def extract_subtitles(self, *args, **kwargs):
1569         if (self._downloader.params.get('writesubtitles', False) or
1570                 self._downloader.params.get('listsubtitles')):
1571             return self._get_subtitles(*args, **kwargs)
1572         return {}
1573
1574     def _get_subtitles(self, *args, **kwargs):
1575         raise NotImplementedError("This method must be implemented by subclasses")
1576
1577     @staticmethod
1578     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1579         """ Merge subtitle items for one language. Items with duplicated URLs
1580         will be dropped. """
1581         list1_urls = set([item['url'] for item in subtitle_list1])
1582         ret = list(subtitle_list1)
1583         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1584         return ret
1585
1586     @classmethod
1587     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1588         """ Merge two subtitle dictionaries, language by language. """
1589         ret = dict(subtitle_dict1)
1590         for lang in subtitle_dict2:
1591             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1592         return ret
1593
1594     def extract_automatic_captions(self, *args, **kwargs):
1595         if (self._downloader.params.get('writeautomaticsub', False) or
1596                 self._downloader.params.get('listsubtitles')):
1597             return self._get_automatic_captions(*args, **kwargs)
1598         return {}
1599
1600     def _get_automatic_captions(self, *args, **kwargs):
1601         raise NotImplementedError("This method must be implemented by subclasses")
1602
1603
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}:
    an empty prefix requests a single result, "all" requests _MAX_RESULTS
    results and a number requests that many (capped at _MAX_RESULTS).
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, search_terms = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            return self._get_n_results(search_terms, 1)
        if prefix == 'all':
            return self._get_n_results(search_terms, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, search_terms))
        if wanted > self._MAX_RESULTS:
            # Requests above the limit are capped, with a warning.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(search_terms, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class-level _SEARCH_KEY."""
        return self._SEARCH_KEY