_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_http_client,
  20     compat_urllib_error,
  21     compat_urllib_parse,
  22     compat_urlparse,
  23     compat_str,
  24     compat_etree_fromstring,
  25 )
  26 from ..utils import (
  27     NO_DEFAULT,
  28     age_restricted,
  29     bug_reports_message,
  30     clean_html,
  31     compiled_regex_type,
  32     determine_ext,
  33     error_to_compat_str,
  34     ExtractorError,
  35     fix_xml_ampersands,
  36     float_or_none,
  37     int_or_none,
  38     parse_iso8601,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     sanitized_Request,
  42     unescapeHTML,
  43     unified_strdate,
  44     url_basename,
  45     xpath_text,
  46     xpath_with_ns,
  47     determine_protocol,
  48     parse_duration,
  49 )
  50
  51
  52 class InfoExtractor(object):
  53     """Information Extractor class.
  54
  55     Information extractors are the classes that, given a URL, extract
  56     information about the video (or videos) the URL refers to. This
  57     information includes the real video URL, the video title, author and
  58     others. The information is stored in a dictionary which is then
  59     passed to the YoutubeDL. The YoutubeDL processes this
  60     information possibly downloading the video to the file system, among
  61     other possible outcomes.
  62
  63     The type field determines the type of the result.
  64     By far the most common value (and the default if _type is missing) is
  65     "video", which indicates a single video.
  66
  67     For a video, the dictionaries must include the following fields:
  68
  69     id:             Video identifier.
  70     title:          Video title, unescaped.
  71
  72     Additionally, it must contain either a formats entry or a url one:
  73
  74     formats:        A list of dictionaries for each format available, ordered
  75                     from worst to best quality.
  76
  77                     Potential fields:
  78                     * url        Mandatory. The URL of the video file
  79                     * ext        Will be calculated from URL if missing
  80                     * format     A human-readable description of the format
  81                                  ("mp4 container with h264/opus").
   82                                  Calculated from the format_id, width, height,
  83                                  and format_note fields if missing.
  84                     * format_id  A short description of the format
  85                                  ("mp4_h264_opus" or "19").
  86                                 Technically optional, but strongly recommended.
  87                     * format_note Additional info about the format
  88                                  ("3D" or "DASH video")
  89                     * width      Width of the video, if known
  90                     * height     Height of the video, if known
  91                     * resolution Textual description of width and height
  92                     * tbr        Average bitrate of audio and video in KBit/s
  93                     * abr        Average audio bitrate in KBit/s
  94                     * acodec     Name of the audio codec in use
  95                     * asr        Audio sampling rate in Hertz
  96                     * vbr        Average video bitrate in KBit/s
  97                     * fps        Frame rate
  98                     * vcodec     Name of the video codec in use
  99                     * container  Name of the container format
 100                     * filesize   The number of bytes, if known in advance
 101                     * filesize_approx  An estimate for the number of bytes
 102                     * player_url SWF Player URL (used for rtmpdump).
 103                     * protocol   The protocol that will be used for the actual
 104                                  download, lower-case.
 105                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 106                                  "m3u8", or "m3u8_native".
 107                     * preference Order number of this format. If this field is
 108                                  present and not None, the formats get sorted
 109                                  by this field, regardless of all other values.
 110                                  -1 for default (order by other properties),
 111                                  -2 or smaller for less than default.
 112                                  < -1000 to hide the format (if there is
 113                                     another one which is strictly better)
 114                     * language   Language code, e.g. "de" or "en-US".
 115                     * language_preference  Is this in the language mentioned in
 116                                  the URL?
 117                                  10 if it's what the URL is about,
 118                                  -1 for default (don't know),
 119                                  -10 otherwise, other values reserved for now.
 120                     * quality    Order number of the video quality of this
 121                                  format, irrespective of the file format.
 122                                  -1 for default (order by other properties),
 123                                  -2 or smaller for less than default.
 124                     * source_preference  Order number for this video source
 125                                   (quality takes higher priority)
 126                                  -1 for default (order by other properties),
 127                                  -2 or smaller for less than default.
 128                     * http_headers  A dictionary of additional HTTP headers
 129                                  to add to the request.
 130                     * stretched_ratio  If given and not 1, indicates that the
 131                                  video's pixels are not square.
 132                                  width : height ratio as float.
 133                     * no_resume  The server does not support resuming the
 134                                  (HTTP or RTMP) download. Boolean.
 135
 136     url:            Final video URL.
 137     ext:            Video filename extension.
 138     format:         The video format, defaults to ext (used for --get-format)
 139     player_url:     SWF Player URL (used for rtmpdump).
 140
 141     The following fields are optional:
 142
 143     alt_title:      A secondary title of the video.
 144     display_id      An alternative identifier for the video, not necessarily
 145                     unique, but available before title. Typically, id is
 146                     something like "4234987", title "Dancing naked mole rats",
 147                     and display_id "dancing-naked-mole-rats"
 148     thumbnails:     A list of dictionaries, with the following entries:
 149                         * "id" (optional, string) - Thumbnail format ID
 150                         * "url"
 151                         * "preference" (optional, int) - quality of the image
 152                         * "width" (optional, int)
 153                         * "height" (optional, int)
  154                         * "resolution" (optional, string "{width}x{height}",
 155                                         deprecated)
 156     thumbnail:      Full URL to a video thumbnail image.
 157     description:    Full video description.
 158     uploader:       Full name of the video uploader.
 159     creator:        The main artist who created the video.
 160     release_date:   The date (YYYYMMDD) when the video was released.
 161     timestamp:      UNIX timestamp of the moment the video became available.
 162     upload_date:    Video upload date (YYYYMMDD).
 163                     If not explicitly set, calculated from timestamp.
 164     uploader_id:    Nickname or id of the video uploader.
 165     location:       Physical location where the video was filmed.
 166     subtitles:      The available subtitles as a dictionary in the format
 167                     {language: subformats}. "subformats" is a list sorted from
 168                     lower to higher preference, each element is a dictionary
 169                     with the "ext" entry and one of:
 170                         * "data": The subtitles file contents
 171                         * "url": A URL pointing to the subtitles file
 172                     "ext" will be calculated from URL if missing
 173     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 174                     automatically generated captions
 175     duration:       Length of the video in seconds, as an integer or float.
 176     view_count:     How many users have watched the video on the platform.
 177     like_count:     Number of positive ratings of the video
 178     dislike_count:  Number of negative ratings of the video
 179     repost_count:   Number of reposts of the video
  180     average_rating: Average rating given by users, the scale used depends on the webpage
 181     comment_count:  Number of comments on the video
 182     comments:       A list of comments, each with one or more of the following
 183                     properties (all but one of text or html optional):
 184                         * "author" - human-readable name of the comment author
 185                         * "author_id" - user ID of the comment author
 186                         * "id" - Comment ID
 187                         * "html" - Comment as HTML
 188                         * "text" - Plain text of the comment
 189                         * "timestamp" - UNIX timestamp of comment
 190                         * "parent" - ID of the comment this one is replying to.
 191                                      Set to "root" to indicate that this is a
 192                                      comment to the original video.
 193     age_limit:      Age restriction for the video, as an integer (years)
 194     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 195                     should allow to get the same result again. (It will be set
 196                     by YoutubeDL if it's missing)
 197     categories:     A list of categories that the video falls in, for example
 198                     ["Sports", "Berlin"]
 199     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 200     is_live:        True, False, or None (=unknown). Whether this video is a
 201                     live stream that goes on instead of a fixed-length video.
 202     start_time:     Time in seconds where the reproduction should start, as
 203                     specified in the URL.
 204     end_time:       Time in seconds where the reproduction should end, as
 205                     specified in the URL.
 206
 207     The following fields should only be used when the video belongs to some logical
 208     chapter or section:
 209
 210     chapter:        Name or title of the chapter the video belongs to.
 211     chapter_number: Number of the chapter the video belongs to, as an integer.
 212     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 213
 214     The following fields should only be used when the video is an episode of some
 215     series or programme:
 216
 217     series:         Title of the series or programme the video episode belongs to.
 218     season:         Title of the season the video episode belongs to.
 219     season_number:  Number of the season the video episode belongs to, as an integer.
 220     season_id:      Id of the season the video episode belongs to, as a unicode string.
 221     episode:        Title of the video episode. Unlike mandatory video title field,
 222                     this field should denote the exact title of the video episode
 223                     without any kind of decoration.
 224     episode_number: Number of the video episode within a season, as an integer.
 225     episode_id:     Id of the video episode, as a unicode string.
 226
 227     Unless mentioned otherwise, the fields should be Unicode strings.
 228
 229     Unless mentioned otherwise, None is equivalent to absence of information.
 230
 231
 232     _type "playlist" indicates multiple videos.
 233     There must be a key "entries", which is a list, an iterable, or a PagedList
 234     object, each element of which is a valid dictionary by this specification.
 235
 236     Additionally, playlists can have "title", "description" and "id" attributes
 237     with the same semantics as videos (see above).
 238
 239
 240     _type "multi_video" indicates that there are multiple videos that
  241     form a single show, for example multiple acts of an opera or TV episode.
 242     It must have an entries key like a playlist and contain all the keys
 243     required for a video at the same time.
 244
 245
 246     _type "url" indicates that the video must be extracted from another
 247     location, possibly by a different extractor. Its only required key is:
 248     "url" - the next URL to extract.
 249     The key "ie_key" can be set to the class name (minus the trailing "IE",
 250     e.g. "Youtube") if the extractor class is known in advance.
 251     Additionally, the dictionary may have any properties of the resolved entity
 252     known in advance, for example "title" if the title of the referred video is
 253     known ahead of time.
 254
 255
 256     _type "url_transparent" entities have the same specification as "url", but
 257     indicate that the given additional information is more precise than the one
 258     associated with the resolved URL.
 259     This is useful when a site employs a video service that hosts the video and
 260     its technical metadata, but that video service does not embed a useful
 261     title, description etc.
 262
 263
 264     Subclasses of this one should re-define the _real_initialize() and
 265     _real_extract() methods and define a _VALID_URL regexp.
 266     Probably, they should also be added to the list of extractors.
 267
 268     Finally, the _WORKING attribute should be set to False for broken IEs
 269     in order to warn the users and skip the tests.
 270     """
 271
    # Becomes True on an instance once initialize() has run _real_initialize().
    _ready = False
    # Downloader object; assigned through set_downloader().
    _downloader = None
    # Value returned by working(); broken extractors set this to False.
    _WORKING = True
 275
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        The instance starts in the "not ready" state, so the first call to
        initialize() will run _real_initialize(). The downloader (which may
        be None) is handed to set_downloader() unchanged.
        """
        self._ready = False
        self.set_downloader(downloader)
 280
 281     @classmethod
 282     def suitable(cls, url):
 283         """Receives a URL and returns True if suitable for this IE."""
 284
 285         # This does not use has/getattr intentionally - we want to know whether
 286         # we have cached the regexp for *this* class, whereas getattr would also
 287         # match the superclass
 288         if '_VALID_URL_RE' not in cls.__dict__:
 289             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 290         return cls._VALID_URL_RE.match(url) is not None
 291
 292     @classmethod
 293     def _match_id(cls, url):
 294         if '_VALID_URL_RE' not in cls.__dict__:
 295             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 296         m = cls._VALID_URL_RE.match(url)
 297         assert m
 298         return m.group('id')
 299
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class attribute as-is; extractors known to be broken
        set it to False.
        """
        return cls._WORKING
 304
 305     def initialize(self):
 306         """Initializes an instance (authentication, etc)."""
 307         if not self._ready:
 308             self._real_initialize()
 309             self._ready = True
 310
    def extract(self, url):
        """Initialize the extractor if needed and return _real_extract(url).

        The return value is whatever _real_extract() produces.

        Error handling:
        * ExtractorError propagates unchanged.
        * IncompleteRead is re-raised as an expected ExtractorError
          ('A network error has occurred.').
        * KeyError and StopIteration are re-raised as an ExtractorError
          ('An extractor error has occurred.').
        In the last two cases the original exception is passed as ``cause``.
        """
        try:
            self.initialize()
            return self._real_extract(url)
        except ExtractorError:
            # Already in its final form; must come before the handlers below.
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
 322
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The object is stored on self._downloader without validation. Other
        methods of this class use it to open URLs (urlopen), to read its
        ``params`` dictionary and to print messages and warnings.
        """
        self._downloader = downloader
 326
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        Called by initialize() while the instance is not yet marked ready.
        This base implementation does nothing.
        """
        pass
 330
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Called by extract() with the URL to process; its return value is
        passed back to the caller of extract(). This base implementation
        does nothing and therefore returns None.
        """
        pass
 334
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor

        It is the class name with its last two characters removed (the
        trailing "IE" of extractor class names), converted to compat_str.
        """
        return compat_str(cls.__name__[:-2])
 339
    @property
    def IE_NAME(self):
        """Name of this extractor, used as the '[name]' prefix of messages.

        Computed from the name of the instance's class with its last two
        characters removed, converted to compat_str.
        """
        return compat_str(type(self).__name__[:-2])
 343
 344     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 345         """ Returns the response handle """
 346         if note is None:
 347             self.report_download_webpage(video_id)
 348         elif note is not False:
 349             if video_id is None:
 350                 self.to_screen('%s' % (note,))
 351             else:
 352                 self.to_screen('%s: %s' % (video_id, note))
 353         try:
 354             return self._downloader.urlopen(url_or_request)
 355         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 356             if errnote is False:
 357                 return False
 358             if errnote is None:
 359                 errnote = 'Unable to download webpage'
 360
 361             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 362             if fatal:
 363                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 364             else:
 365                 self._downloader.report_warning(errmsg)
 366                 return False
 367
 368     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 369         """ Returns a tuple (page content as string, URL handle) """
 370         # Strip hashes from the URL (#1038)
 371         if isinstance(url_or_request, (compat_str, str)):
 372             url_or_request = url_or_request.partition('#')[0]
 373
 374         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 375         if urlh is False:
 376             assert not fatal
 377             return False
 378         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 379         return (content, urlh)
 380
 381     @staticmethod
 382     def _guess_encoding_from_content(content_type, webpage_bytes):
 383         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 384         if m:
 385             encoding = m.group(1)
 386         else:
 387             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 388                           webpage_bytes[:1024])
 389             if m:
 390                 encoding = m.group(1).decode('ascii')
 391             elif webpage_bytes.startswith(b'\xff\xfe'):
 392                 encoding = 'utf-16'
 393             else:
 394                 encoding = 'utf-8'
 395
 396         return encoding
 397
 398     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 399         content_type = urlh.headers.get('Content-Type', '')
 400         webpage_bytes = urlh.read()
 401         if prefix is not None:
 402             webpage_bytes = prefix + webpage_bytes
 403         if not encoding:
 404             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 405         if self._downloader.params.get('dump_intermediate_pages', False):
 406             try:
 407                 url = url_or_request.get_full_url()
 408             except AttributeError:
 409                 url = url_or_request
 410             self.to_screen('Dumping request to ' + url)
 411             dump = base64.b64encode(webpage_bytes).decode('ascii')
 412             self._downloader.to_screen(dump)
 413         if self._downloader.params.get('write_pages', False):
 414             try:
 415                 url = url_or_request.get_full_url()
 416             except AttributeError:
 417                 url = url_or_request
 418             basen = '%s_%s' % (video_id, url)
 419             if len(basen) > 240:
 420                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 421                 basen = basen[:240 - len(h)] + h
 422             raw_filename = basen + '.dump'
 423             filename = sanitize_filename(raw_filename, restricted=True)
 424             self.to_screen('Saving request to ' + filename)
 425             # Working around MAX_PATH limitation on Windows (see
 426             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 427             if os.name == 'nt':
 428                 absfilepath = os.path.abspath(filename)
 429                 if len(absfilepath) > 259:
 430                     filename = '\\\\?\\' + absfilepath
 431             with open(filename, 'wb') as outf:
 432                 outf.write(webpage_bytes)
 433
 434         try:
 435             content = webpage_bytes.decode(encoding, 'replace')
 436         except LookupError:
 437             content = webpage_bytes.decode('utf-8', 'replace')
 438
 439         if ('<title>Access to this site is blocked</title>' in content and
 440                 'Websense' in content[:512]):
 441             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 442             blocked_iframe = self._html_search_regex(
 443                 r'<iframe src="([^"]+)"', content,
 444                 'Websense information URL', default=None)
 445             if blocked_iframe:
 446                 msg += ' Visit %s for more details' % blocked_iframe
 447             raise ExtractorError(msg, expected=True)
 448         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 449             msg = (
 450                 'Access to this webpage has been blocked by Indian censorship. '
 451                 'Use a VPN or proxy server (with --proxy) to route around it.')
 452             block_msg = self._html_search_regex(
 453                 r'</h1><p>(.*?)</p>',
 454                 content, 'block message', default=None)
 455             if block_msg:
 456                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 457             raise ExtractorError(msg, expected=True)
 458
 459         return content
 460
 461     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 462         """ Returns the data of the page as a string """
 463         success = False
 464         try_count = 0
 465         while success is False:
 466             try:
 467                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 468                 success = True
 469             except compat_http_client.IncompleteRead as e:
 470                 try_count += 1
 471                 if try_count >= tries:
 472                     raise e
 473                 self._sleep(timeout, video_id)
 474         if res is False:
 475             return res
 476         else:
 477             content, _ = res
 478             return content
 479
 480     def _download_xml(self, url_or_request, video_id,
 481                       note='Downloading XML', errnote='Unable to download XML',
 482                       transform_source=None, fatal=True, encoding=None):
 483         """Return the xml as an xml.etree.ElementTree.Element"""
 484         xml_string = self._download_webpage(
 485             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 486         if xml_string is False:
 487             return xml_string
 488         if transform_source:
 489             xml_string = transform_source(xml_string)
 490         return compat_etree_fromstring(xml_string.encode('utf-8'))
 491
 492     def _download_json(self, url_or_request, video_id,
 493                        note='Downloading JSON metadata',
 494                        errnote='Unable to download JSON metadata',
 495                        transform_source=None,
 496                        fatal=True, encoding=None):
 497         json_string = self._download_webpage(
 498             url_or_request, video_id, note, errnote, fatal=fatal,
 499             encoding=encoding)
 500         if (not fatal) and json_string is False:
 501             return None
 502         return self._parse_json(
 503             json_string, video_id, transform_source=transform_source, fatal=fatal)
 504
 505     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 506         if transform_source:
 507             json_string = transform_source(json_string)
 508         try:
 509             return json.loads(json_string)
 510         except ValueError as ve:
 511             errmsg = '%s: Failed to parse JSON ' % video_id
 512             if fatal:
 513                 raise ExtractorError(errmsg, cause=ve)
 514             else:
 515                 self.report_warning(errmsg + str(ve))
 516
 517     def report_warning(self, msg, video_id=None):
 518         idstr = '' if video_id is None else '%s: ' % video_id
 519         self._downloader.report_warning(
 520             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 521
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'

        The output goes through the downloader's to_screen(), so a
        downloader must have been set.
        """
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 525
    def report_extraction(self, id_or_name):
        """Report information extraction.

        Prints '<id_or_name>: Extracting information' via to_screen().
        """
        self.to_screen('%s: Extracting information' % id_or_name)
 529
    def report_download_webpage(self, video_id):
        """Report webpage download.

        Prints '<video_id>: Downloading webpage' via to_screen(); this is
        the default progress note of _request_webpage().
        """
        self.to_screen('%s: Downloading webpage' % video_id)
 533
    def report_age_confirmation(self):
        """Report attempt to confirm age.

        Prints 'Confirming age' via to_screen().
        """
        self.to_screen('Confirming age')
 537
    def report_login(self):
        """Report attempt to log in.

        Prints 'Logging in' via to_screen().
        """
        self.to_screen('Logging in')
 541
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        """Raise an expected ExtractorError asking for account credentials.

        msg is the leading sentence, given without a trailing period; a hint
        about --username, --password and --netrc is appended to it.
        """
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)
 547
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        """Raise an expected ExtractorError reporting a geo restriction.

        msg is the leading sentence, given without a trailing period; a hint
        about using --proxy is appended to it.
        """
        raise ExtractorError(
            '%s. You might want to use --proxy to workaround.' % msg,
            expected=True)
 553
 554     # Methods for following #608
 555     @staticmethod
 556     def url_result(url, ie=None, video_id=None, video_title=None):
 557         """Returns a URL that points to a page that should be processed"""
 558         # TODO: ie should be the class used for getting the info
 559         video_info = {'_type': 'url',
 560                       'url': url,
 561                       'ie_key': ie}
 562         if video_id is not None:
 563             video_info['id'] = video_id
 564         if video_title is not None:
 565             video_info['title'] = video_title
 566         return video_info
 567
 568     @staticmethod
 569     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 570         """Returns a playlist"""
 571         video_info = {'_type': 'playlist',
 572                       'entries': entries}
 573         if playlist_id:
 574             video_info['id'] = playlist_id
 575         if playlist_title:
 576             video_info['title'] = playlist_title
 577         if playlist_description:
 578             video_info['description'] = playlist_description
 579         return video_info
 580
 581     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 582         """
 583         Perform a regex search on the given string, using a single or a list of
 584         patterns returning the first matching group.
 585         In case of failure return a default value or raise a WARNING or a
 586         RegexNotFoundError, depending on fatal, specifying the field name.
 587         """
 588         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 589             mobj = re.search(pattern, string, flags)
 590         else:
 591             for p in pattern:
 592                 mobj = re.search(p, string, flags)
 593                 if mobj:
 594                     break
 595
 596         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 597             _name = '\033[0;34m%s\033[0m' % name
 598         else:
 599             _name = name
 600
 601         if mobj:
 602             if group is None:
 603                 # return the first matching group
 604                 return next(g for g in mobj.groups() if g is not None)
 605             else:
 606                 return mobj.group(group)
 607         elif default is not NO_DEFAULT:
 608             return default
 609         elif fatal:
 610             raise RegexNotFoundError('Unable to extract %s' % _name)
 611         else:
 612             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 613             return None
 614
 615     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 616         """
 617         Like _search_regex, but strips HTML tags and unescapes entities.
 618         """
 619         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 620         if res:
 621             return clean_html(res).strip()
 622         else:
 623             return res
 624
 625     def _get_login_info(self):
 626         """
 627         Get the login info as (username, password)
 628         It will look in the netrc file using the _NETRC_MACHINE value
 629         If there's no info available, return (None, None)
 630         """
 631         if self._downloader is None:
 632             return (None, None)
 633
 634         username = None
 635         password = None
 636         downloader_params = self._downloader.params
 637
 638         # Attempt to use provided username and password or .netrc data
 639         if downloader_params.get('username', None) is not None:
 640             username = downloader_params['username']
 641             password = downloader_params['password']
 642         elif downloader_params.get('usenetrc', False):
 643             try:
 644                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 645                 if info is not None:
 646                     username = info[0]
 647                     password = info[2]
 648                 else:
 649                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 650             except (IOError, netrc.NetrcParseError) as err:
 651                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 652
 653         return (username, password)
 654
 655     def _get_tfa_info(self, note='two-factor verification code'):
 656         """
 657         Get the two-factor authentication info
 658         TODO - asking the user will be required for sms/phone verify
 659         currently just uses the command line option
 660         If there's no info available, return None
 661         """
 662         if self._downloader is None:
 663             return None
 664         downloader_params = self._downloader.params
 665
 666         if downloader_params.get('twofactor', None) is not None:
 667             return downloader_params['twofactor']
 668
 669         return compat_getpass('Type %s and press [Return]: ' % note)
 670
 671     # Helper functions for extracting OpenGraph info
 672     @staticmethod
 673     def _og_regexes(prop):
 674         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 675         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 676                        % {'prop': re.escape(prop)})
 677         template = r'<meta[^>]+?%s[^>]+?%s'
 678         return [
 679             template % (property_re, content_re),
 680             template % (content_re, property_re),
 681         ]
 682
 683     @staticmethod
 684     def _meta_regex(prop):
 685         return r'''(?isx)<meta
 686                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 687                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 688
 689     def _og_search_property(self, prop, html, name=None, **kargs):
 690         if name is None:
 691             name = 'OpenGraph %s' % prop
 692         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 693         if escaped is None:
 694             return None
 695         return unescapeHTML(escaped)
 696
 697     def _og_search_thumbnail(self, html, **kargs):
 698         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 699
 700     def _og_search_description(self, html, **kargs):
 701         return self._og_search_property('description', html, fatal=False, **kargs)
 702
 703     def _og_search_title(self, html, **kargs):
 704         return self._og_search_property('title', html, **kargs)
 705
 706     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 707         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 708         if secure:
 709             regexes = self._og_regexes('video:secure_url') + regexes
 710         return self._html_search_regex(regexes, html, name, **kargs)
 711
 712     def _og_search_url(self, html, **kargs):
 713         return self._og_search_property('url', html, **kargs)
 714
 715     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 716         if display_name is None:
 717             display_name = name
 718         return self._html_search_regex(
 719             self._meta_regex(name),
 720             html, display_name, fatal=fatal, group='content', **kwargs)
 721
 722     def _dc_search_uploader(self, html):
 723         return self._html_search_meta('dc.creator', html, 'uploader')
 724
 725     def _rta_search(self, html):
 726         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 727         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 728                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 729                      html):
 730             return 18
 731         return 0
 732
 733     def _media_rating_search(self, html):
 734         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 735         rating = self._html_search_meta('rating', html)
 736
 737         if not rating:
 738             return None
 739
 740         RATING_TABLE = {
 741             'safe for kids': 0,
 742             'general': 8,
 743             '14 years': 14,
 744             'mature': 17,
 745             'restricted': 19,
 746         }
 747         return RATING_TABLE.get(rating.lower(), None)
 748
 749     def _family_friendly_search(self, html):
 750         # See http://schema.org/VideoObject
 751         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 752
 753         if not family_friendly:
 754             return None
 755
 756         RATING_TABLE = {
 757             '1': 0,
 758             'true': 0,
 759             '0': 18,
 760             'false': 18,
 761         }
 762         return RATING_TABLE.get(family_friendly.lower(), None)
 763
 764     def _twitter_search_player(self, html):
 765         return self._html_search_meta('twitter:player', html,
 766                                       'twitter card player')
 767
 768     def _search_json_ld(self, html, video_id, **kwargs):
 769         json_ld = self._search_regex(
 770             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 771             html, 'JSON-LD', group='json_ld', **kwargs)
 772         if not json_ld:
 773             return {}
 774         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 775
 776     def _json_ld(self, json_ld, video_id, fatal=True):
 777         if isinstance(json_ld, compat_str):
 778             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 779         if not json_ld:
 780             return {}
 781         info = {}
 782         if json_ld.get('@context') == 'http://schema.org':
 783             item_type = json_ld.get('@type')
 784             if item_type == 'TVEpisode':
 785                 info.update({
 786                     'episode': unescapeHTML(json_ld.get('name')),
 787                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 788                     'description': unescapeHTML(json_ld.get('description')),
 789                 })
 790                 part_of_season = json_ld.get('partOfSeason')
 791                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 792                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 793                 part_of_series = json_ld.get('partOfSeries')
 794                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 795                     info['series'] = unescapeHTML(part_of_series.get('name'))
 796             elif item_type == 'Article':
 797                 info.update({
 798                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 799                     'title': unescapeHTML(json_ld.get('headline')),
 800                     'description': unescapeHTML(json_ld.get('articleBody')),
 801                 })
 802         return dict((k, v) for k, v in info.items() if v is not None)
 803
 804     @staticmethod
 805     def _hidden_inputs(html):
 806         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 807         hidden_inputs = {}
 808         for input in re.findall(r'(?i)<input([^>]+)>', html):
 809             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 810                 continue
 811             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 812             if not name:
 813                 continue
 814             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 815             if not value:
 816                 continue
 817             hidden_inputs[name.group('value')] = value.group('value')
 818         return hidden_inputs
 819
 820     def _form_hidden_inputs(self, form_id, html):
 821         form = self._search_regex(
 822             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 823             html, '%s form' % form_id, group='form')
 824         return self._hidden_inputs(form)
 825
 826     def _sort_formats(self, formats, field_preference=None):
 827         if not formats:
 828             raise ExtractorError('No video formats found')
 829
 830         for f in formats:
 831             # Automatically determine tbr when missing based on abr and vbr (improves
 832             # formats sorting in some cases)
 833             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 834                 f['tbr'] = f['abr'] + f['vbr']
 835
 836         def _formats_key(f):
 837             # TODO remove the following workaround
 838             from ..utils import determine_ext
 839             if not f.get('ext') and 'url' in f:
 840                 f['ext'] = determine_ext(f['url'])
 841
 842             if isinstance(field_preference, (list, tuple)):
 843                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 844
 845             preference = f.get('preference')
 846             if preference is None:
 847                 preference = 0
 848                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 849                     preference -= 0.5
 850
 851             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 852
 853             if f.get('vcodec') == 'none':  # audio only
 854                 if self._downloader.params.get('prefer_free_formats'):
 855                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 856                 else:
 857                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 858                 ext_preference = 0
 859                 try:
 860                     audio_ext_preference = ORDER.index(f['ext'])
 861                 except ValueError:
 862                     audio_ext_preference = -1
 863             else:
 864                 if self._downloader.params.get('prefer_free_formats'):
 865                     ORDER = ['flv', 'mp4', 'webm']
 866                 else:
 867                     ORDER = ['webm', 'flv', 'mp4']
 868                 try:
 869                     ext_preference = ORDER.index(f['ext'])
 870                 except ValueError:
 871                     ext_preference = -1
 872                 audio_ext_preference = 0
 873
 874             return (
 875                 preference,
 876                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 877                 f.get('quality') if f.get('quality') is not None else -1,
 878                 f.get('tbr') if f.get('tbr') is not None else -1,
 879                 f.get('filesize') if f.get('filesize') is not None else -1,
 880                 f.get('vbr') if f.get('vbr') is not None else -1,
 881                 f.get('height') if f.get('height') is not None else -1,
 882                 f.get('width') if f.get('width') is not None else -1,
 883                 proto_preference,
 884                 ext_preference,
 885                 f.get('abr') if f.get('abr') is not None else -1,
 886                 audio_ext_preference,
 887                 f.get('fps') if f.get('fps') is not None else -1,
 888                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 889                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 890                 f.get('format_id') if f.get('format_id') is not None else '',
 891             )
 892         formats.sort(key=_formats_key)
 893
 894     def _check_formats(self, formats, video_id):
 895         if formats:
 896             formats[:] = filter(
 897                 lambda f: self._is_valid_url(
 898                     f['url'], video_id,
 899                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 900                 formats)
 901
 902     def _is_valid_url(self, url, video_id, item='video'):
 903         url = self._proto_relative_url(url, scheme='http:')
 904         # For now assume non HTTP(S) URLs always valid
 905         if not (url.startswith('http://') or url.startswith('https://')):
 906             return True
 907         try:
 908             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 909             return True
 910         except ExtractorError as e:
 911             if isinstance(e.cause, compat_urllib_error.URLError):
 912                 self.to_screen(
 913                     '%s: %s URL is invalid, skipping' % (video_id, item))
 914                 return False
 915             raise
 916
 917     def http_scheme(self):
 918         """ Either "http:" or "https:", depending on the user's preferences """
 919         return (
 920             'http:'
 921             if self._downloader.params.get('prefer_insecure', False)
 922             else 'https:')
 923
 924     def _proto_relative_url(self, url, scheme=None):
 925         if url is None:
 926             return url
 927         if url.startswith('//'):
 928             if scheme is None:
 929                 scheme = self.http_scheme()
 930             return scheme + url
 931         else:
 932             return url
 933
 934     def _sleep(self, timeout, video_id, msg_template=None):
 935         if msg_template is None:
 936             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 937         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 938         self.to_screen(msg)
 939         time.sleep(timeout)
 940
 941     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 942                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 943                              fatal=True):
 944         manifest = self._download_xml(
 945             manifest_url, video_id, 'Downloading f4m manifest',
 946             'Unable to download f4m manifest',
 947             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 948             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 949             transform_source=transform_source,
 950             fatal=fatal)
 951
 952         if manifest is False:
 953             return []
 954
 955         formats = []
 956         manifest_version = '1.0'
 957         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 958         if not media_nodes:
 959             manifest_version = '2.0'
 960             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 961         base_url = xpath_text(
 962             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 963             'base URL', default=None)
 964         if base_url:
 965             base_url = base_url.strip()
 966         for i, media_el in enumerate(media_nodes):
 967             if manifest_version == '2.0':
 968                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 969                 if not media_url:
 970                     continue
 971                 manifest_url = (
 972                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 973                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 974                 # If media_url is itself a f4m manifest do the recursive extraction
 975                 # since bitrates in parent manifest (this one) and media_url manifest
 976                 # may differ leading to inability to resolve the format by requested
 977                 # bitrate in f4m downloader
 978                 if determine_ext(manifest_url) == 'f4m':
 979                     formats.extend(self._extract_f4m_formats(
 980                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
 981                     continue
 982             tbr = int_or_none(media_el.attrib.get('bitrate'))
 983             formats.append({
 984                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 985                 'url': manifest_url,
 986                 'ext': 'flv',
 987                 'tbr': tbr,
 988                 'width': int_or_none(media_el.attrib.get('width')),
 989                 'height': int_or_none(media_el.attrib.get('height')),
 990                 'preference': preference,
 991             })
 992         self._sort_formats(formats)
 993
 994         return formats
 995
 996     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 997                               entry_protocol='m3u8', preference=None,
 998                               m3u8_id=None, note=None, errnote=None,
 999                               fatal=True):
1000
1001         formats = [{
1002             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1003             'url': m3u8_url,
1004             'ext': ext,
1005             'protocol': 'm3u8',
1006             'preference': preference - 1 if preference else -1,
1007             'resolution': 'multiple',
1008             'format_note': 'Quality selection URL',
1009         }]
1010
1011         format_url = lambda u: (
1012             u
1013             if re.match(r'^https?://', u)
1014             else compat_urlparse.urljoin(m3u8_url, u))
1015
1016         res = self._download_webpage_handle(
1017             m3u8_url, video_id,
1018             note=note or 'Downloading m3u8 information',
1019             errnote=errnote or 'Failed to download m3u8 information',
1020             fatal=fatal)
1021         if res is False:
1022             return []
1023         m3u8_doc, urlh = res
1024         m3u8_url = urlh.geturl()
1025         # A Media Playlist Tag MUST NOT appear in a Master Playlist
1026         # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1027         # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
1028         # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1029         if '#EXT-X-TARGETDURATION' in m3u8_doc:
1030             return [{
1031                 'url': m3u8_url,
1032                 'format_id': m3u8_id,
1033                 'ext': ext,
1034                 'protocol': entry_protocol,
1035                 'preference': preference,
1036             }]
1037         last_info = None
1038         last_media = None
1039         kv_rex = re.compile(
1040             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1041         for line in m3u8_doc.splitlines():
1042             if line.startswith('#EXT-X-STREAM-INF:'):
1043                 last_info = {}
1044                 for m in kv_rex.finditer(line):
1045                     v = m.group('val')
1046                     if v.startswith('"'):
1047                         v = v[1:-1]
1048                     last_info[m.group('key')] = v
1049             elif line.startswith('#EXT-X-MEDIA:'):
1050                 last_media = {}
1051                 for m in kv_rex.finditer(line):
1052                     v = m.group('val')
1053                     if v.startswith('"'):
1054                         v = v[1:-1]
1055                     last_media[m.group('key')] = v
1056             elif line.startswith('#') or not line.strip():
1057                 continue
1058             else:
1059                 if last_info is None:
1060                     formats.append({'url': format_url(line)})
1061                     continue
1062                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1063                 format_id = []
1064                 if m3u8_id:
1065                     format_id.append(m3u8_id)
1066                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1067                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1068                 f = {
1069                     'format_id': '-'.join(format_id),
1070                     'url': format_url(line.strip()),
1071                     'tbr': tbr,
1072                     'ext': ext,
1073                     'protocol': entry_protocol,
1074                     'preference': preference,
1075                 }
1076                 codecs = last_info.get('CODECS')
1077                 if codecs:
1078                     # TODO: looks like video codec is not always necessarily goes first
1079                     va_codecs = codecs.split(',')
1080                     if va_codecs[0]:
1081                         f['vcodec'] = va_codecs[0]
1082                     if len(va_codecs) > 1 and va_codecs[1]:
1083                         f['acodec'] = va_codecs[1]
1084                 resolution = last_info.get('RESOLUTION')
1085                 if resolution:
1086                     width_str, height_str = resolution.split('x')
1087                     f['width'] = int(width_str)
1088                     f['height'] = int(height_str)
1089                 if last_media is not None:
1090                     f['m3u8_media'] = last_media
1091                     last_media = None
1092                 formats.append(f)
1093                 last_info = {}
1094         self._sort_formats(formats)
1095         return formats
1096
1097     @staticmethod
1098     def _xpath_ns(path, namespace=None):
1099         if not namespace:
1100             return path
1101         out = []
1102         for c in path.split('/'):
1103             if not c or c == '.':
1104                 out.append(c)
1105             else:
1106                 out.append('{%s}%s' % (namespace, c))
1107         return '/'.join(out)
1108
1109     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1110         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1111
1112         if smil is False:
1113             assert not fatal
1114             return []
1115
1116         namespace = self._parse_smil_namespace(smil)
1117
1118         return self._parse_smil_formats(
1119             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1120
1121     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1122         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1123         if smil is False:
1124             return {}
1125         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1126
1127     def _download_smil(self, smil_url, video_id, fatal=True):
1128         return self._download_xml(
1129             smil_url, video_id, 'Downloading SMIL file',
1130             'Unable to download SMIL file', fatal=fatal)
1131
1132     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1133         namespace = self._parse_smil_namespace(smil)
1134
1135         formats = self._parse_smil_formats(
1136             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1137         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1138
1139         video_id = os.path.splitext(url_basename(smil_url))[0]
1140         title = None
1141         description = None
1142         upload_date = None
1143         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1144             name = meta.attrib.get('name')
1145             content = meta.attrib.get('content')
1146             if not name or not content:
1147                 continue
1148             if not title and name == 'title':
1149                 title = content
1150             elif not description and name in ('description', 'abstract'):
1151                 description = content
1152             elif not upload_date and name == 'date':
1153                 upload_date = unified_strdate(content)
1154
1155         thumbnails = [{
1156             'id': image.get('type'),
1157             'url': image.get('src'),
1158             'width': int_or_none(image.get('width')),
1159             'height': int_or_none(image.get('height')),
1160         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1161
1162         return {
1163             'id': video_id,
1164             'title': title or video_id,
1165             'description': description,
1166             'upload_date': upload_date,
1167             'thumbnails': thumbnails,
1168             'formats': formats,
1169             'subtitles': subtitles,
1170         }
1171
1172     def _parse_smil_namespace(self, smil):
1173         return self._search_regex(
1174             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1175
1176     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1177         base = smil_url
1178         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1179             b = meta.get('base') or meta.get('httpBase')
1180             if b:
1181                 base = b
1182                 break
1183
1184         formats = []
1185         rtmp_count = 0
1186         http_count = 0
1187         m3u8_count = 0
1188
1189         src_urls = []
1190         videos = smil.findall(self._xpath_ns('.//video', namespace))
1191         for video in videos:
1192             src = video.get('src')
1193             if not src:
1194                 continue
1195
1196             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1197             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1198             width = int_or_none(video.get('width'))
1199             height = int_or_none(video.get('height'))
1200             proto = video.get('proto')
1201             ext = video.get('ext')
1202             src_ext = determine_ext(src)
1203             streamer = video.get('streamer') or base
1204
1205             if proto == 'rtmp' or streamer.startswith('rtmp'):
1206                 rtmp_count += 1
1207                 formats.append({
1208                     'url': streamer,
1209                     'play_path': src,
1210                     'ext': 'flv',
1211                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1212                     'tbr': bitrate,
1213                     'filesize': filesize,
1214                     'width': width,
1215                     'height': height,
1216                 })
1217                 if transform_rtmp_url:
1218                     streamer, src = transform_rtmp_url(streamer, src)
1219                     formats[-1].update({
1220                         'url': streamer,
1221                         'play_path': src,
1222                     })
1223                 continue
1224
1225             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1226             if src_url in src_urls:
1227                 continue
1228             src_urls.append(src_url)
1229
1230             if proto == 'm3u8' or src_ext == 'm3u8':
1231                 m3u8_formats = self._extract_m3u8_formats(
1232                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1233                 if len(m3u8_formats) == 1:
1234                     m3u8_count += 1
1235                     m3u8_formats[0].update({
1236                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1237                         'tbr': bitrate,
1238                         'width': width,
1239                         'height': height,
1240                     })
1241                 formats.extend(m3u8_formats)
1242                 continue
1243
1244             if src_ext == 'f4m':
1245                 f4m_url = src_url
1246                 if not f4m_params:
1247                     f4m_params = {
1248                         'hdcore': '3.2.0',
1249                         'plugin': 'flowplayer-3.2.0.1',
1250                     }
1251                 f4m_url += '&' if '?' in f4m_url else '?'
1252                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1253                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1254                 continue
1255
1256             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1257                 http_count += 1
1258                 formats.append({
1259                     'url': src_url,
1260                     'ext': ext or src_ext or 'flv',
1261                     'format_id': 'http-%d' % (bitrate or http_count),
1262                     'tbr': bitrate,
1263                     'filesize': filesize,
1264                     'width': width,
1265                     'height': height,
1266                 })
1267                 continue
1268
1269         self._sort_formats(formats)
1270
1271         return formats
1272
1273     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1274         urls = []
1275         subtitles = {}
1276         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1277             src = textstream.get('src')
1278             if not src or src in urls:
1279                 continue
1280             urls.append(src)
1281             ext = textstream.get('ext') or determine_ext(src)
1282             if not ext:
1283                 type_ = textstream.get('type')
1284                 SUBTITLES_TYPES = {
1285                     'text/vtt': 'vtt',
1286                     'text/srt': 'srt',
1287                     'application/smptett+xml': 'tt',
1288                 }
1289                 if type_ in SUBTITLES_TYPES:
1290                     ext = SUBTITLES_TYPES[type_]
1291             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1292             subtitles.setdefault(lang, []).append({
1293                 'url': src,
1294                 'ext': ext,
1295             })
1296         return subtitles
1297
1298     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1299         xspf = self._download_xml(
1300             playlist_url, playlist_id, 'Downloading xpsf playlist',
1301             'Unable to download xspf manifest', fatal=fatal)
1302         if xspf is False:
1303             return []
1304         return self._parse_xspf(xspf, playlist_id)
1305
1306     def _parse_xspf(self, playlist, playlist_id):
1307         NS_MAP = {
1308             'xspf': 'http://xspf.org/ns/0/',
1309             's1': 'http://static.streamone.nl/player/ns/0',
1310         }
1311
1312         entries = []
1313         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1314             title = xpath_text(
1315                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1316             description = xpath_text(
1317                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1318             thumbnail = xpath_text(
1319                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1320             duration = float_or_none(
1321                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1322
1323             formats = [{
1324                 'url': location.text,
1325                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1326                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1327                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1328             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1329             self._sort_formats(formats)
1330
1331             entries.append({
1332                 'id': playlist_id,
1333                 'title': title,
1334                 'description': description,
1335                 'thumbnail': thumbnail,
1336                 'duration': duration,
1337                 'formats': formats,
1338             })
1339         return entries
1340
1341     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1342         res = self._download_webpage_handle(
1343             mpd_url, video_id,
1344             note=note or 'Downloading MPD manifest',
1345             errnote=errnote or 'Failed to download MPD manifest',
1346             fatal=fatal)
1347         if res is False:
1348             return []
1349         mpd, urlh = res
1350         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1351
1352         return self._parse_mpd_formats(
1353             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1354
1355     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1356         if mpd_doc.get('type') == 'dynamic':
1357             return []
1358
1359         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1360
1361         def _add_ns(path):
1362             return self._xpath_ns(path, namespace)
1363
1364         def is_drm_protected(element):
1365             return element.find(_add_ns('ContentProtection')) is not None
1366
1367         def extract_multisegment_info(element, ms_parent_info):
1368             ms_info = ms_parent_info.copy()
1369             segment_list = element.find(_add_ns('SegmentList'))
1370             if segment_list is not None:
1371                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1372                 if segment_urls_e:
1373                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1374                 initialization = segment_list.find(_add_ns('Initialization'))
1375                 if initialization is not None:
1376                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1377             else:
1378                 segment_template = element.find(_add_ns('SegmentTemplate'))
1379                 if segment_template is not None:
1380                     start_number = segment_template.get('startNumber')
1381                     if start_number:
1382                         ms_info['start_number'] = int(start_number)
1383                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1384                     if segment_timeline is not None:
1385                         s_e = segment_timeline.findall(_add_ns('S'))
1386                         if s_e:
1387                             ms_info['total_number'] = 0
1388                             for s in s_e:
1389                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1390                     else:
1391                         timescale = segment_template.get('timescale')
1392                         if timescale:
1393                             ms_info['timescale'] = int(timescale)
1394                         segment_duration = segment_template.get('duration')
1395                         if segment_duration:
1396                             ms_info['segment_duration'] = int(segment_duration)
1397                     media_template = segment_template.get('media')
1398                     if media_template:
1399                         ms_info['media_template'] = media_template
1400                     initialization = segment_template.get('initialization')
1401                     if initialization:
1402                         ms_info['initialization_url'] = initialization
1403                     else:
1404                         initialization = segment_template.find(_add_ns('Initialization'))
1405                         if initialization is not None:
1406                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1407             return ms_info
1408
1409         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1410         formats = []
1411         for period in mpd_doc.findall(_add_ns('Period')):
1412             period_duration = parse_duration(period.get('duration')) or mpd_duration
1413             period_ms_info = extract_multisegment_info(period, {
1414                 'start_number': 1,
1415                 'timescale': 1,
1416             })
1417             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1418                 if is_drm_protected(adaptation_set):
1419                     continue
1420                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1421                 for representation in adaptation_set.findall(_add_ns('Representation')):
1422                     if is_drm_protected(representation):
1423                         continue
1424                     representation_attrib = adaptation_set.attrib.copy()
1425                     representation_attrib.update(representation.attrib)
1426                     mime_type = representation_attrib.get('mimeType')
1427                     content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1428                     if content_type == 'text':
1429                         # TODO implement WebVTT downloading
1430                         pass
1431                     elif content_type == 'video' or content_type == 'audio':
1432                         base_url = ''
1433                         for element in (representation, adaptation_set, period, mpd_doc):
1434                             base_url_e = element.find(_add_ns('BaseURL'))
1435                             if base_url_e is not None:
1436                                 base_url = base_url_e.text + base_url
1437                                 if re.match(r'^https?://', base_url):
1438                                     break
1439                         if not re.match(r'^https?://', base_url):
1440                             base_url = mpd_base_url + base_url
1441                         representation_id = representation_attrib.get('id')
1442                         lang = representation_attrib.get('lang')
1443                         f = {
1444                             'format_id': mpd_id or representation_id,
1445                             'url': base_url,
1446                             'width': int_or_none(representation_attrib.get('width')),
1447                             'height': int_or_none(representation_attrib.get('height')),
1448                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1449                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1450                             'fps': int_or_none(representation_attrib.get('frameRate')),
1451                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1452                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1453                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1454                             'format_note': 'DASH %s' % content_type,
1455                         }
1456                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1457                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1458                             if 'total_number' not in representation_ms_info and 'segment_duration':
1459                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1460                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1461                             media_template = representation_ms_info['media_template']
1462                             media_template = media_template.replace('$RepresentationID$', representation_id)
1463                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1464                             media_template.replace('$$', '$')
1465                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1466                         if 'segment_urls' in representation_ms_info:
1467                             f.update({
1468                                 'segment_urls': representation_ms_info['segment_urls'],
1469                                 'protocol': 'http_dash_segments',
1470                             })
1471                             if 'initialization_url' in representation_ms_info:
1472                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1473                                 f.update({
1474                                     'initialization_url': initialization_url,
1475                                 })
1476                                 if not f.get('url'):
1477                                     f['url'] = initialization_url
1478                         try:
1479                             existing_format = next(
1480                                 fo for fo in formats
1481                                 if fo['format_id'] == representation_id)
1482                         except StopIteration:
1483                             full_info = formats_dict.get(representation_id, {}).copy()
1484                             full_info.update(f)
1485                             formats.append(full_info)
1486                         else:
1487                             existing_format.update(f)
1488                     else:
1489                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1490         self._sort_formats(formats)
1491         return formats
1492
1493     def _live_title(self, name):
1494         """ Generate the title for a live video """
1495         now = datetime.datetime.now()
1496         now_str = now.strftime("%Y-%m-%d %H:%M")
1497         return name + ' ' + now_str
1498
1499     def _int(self, v, name, fatal=False, **kwargs):
1500         res = int_or_none(v, **kwargs)
1501         if 'get_attr' in kwargs:
1502             print(getattr(v, kwargs['get_attr']))
1503         if res is None:
1504             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1505             if fatal:
1506                 raise ExtractorError(msg)
1507             else:
1508                 self._downloader.report_warning(msg)
1509         return res
1510
1511     def _float(self, v, name, fatal=False, **kwargs):
1512         res = float_or_none(v, **kwargs)
1513         if res is None:
1514             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1515             if fatal:
1516                 raise ExtractorError(msg)
1517             else:
1518                 self._downloader.report_warning(msg)
1519         return res
1520
1521     def _set_cookie(self, domain, name, value, expire_time=None):
1522         cookie = compat_cookiejar.Cookie(
1523             0, name, value, None, None, domain, None,
1524             None, '/', True, False, expire_time, '', None, None, None)
1525         self._downloader.cookiejar.set_cookie(cookie)
1526
1527     def _get_cookies(self, url):
1528         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1529         req = sanitized_Request(url)
1530         self._downloader.cookiejar.add_cookie_header(req)
1531         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1532
1533     def get_testcases(self, include_onlymatching=False):
1534         t = getattr(self, '_TEST', None)
1535         if t:
1536             assert not hasattr(self, '_TESTS'), \
1537                 '%s has _TEST and _TESTS' % type(self).__name__
1538             tests = [t]
1539         else:
1540             tests = getattr(self, '_TESTS', [])
1541         for t in tests:
1542             if not include_onlymatching and t.get('only_matching', False):
1543                 continue
1544             t['name'] = type(self).__name__[:-len('IE')]
1545             yield t
1546
1547     def is_suitable(self, age_limit):
1548         """ Test whether the extractor is generally suitable for the given
1549         age limit (i.e. pornographic sites are not, all others usually are) """
1550
1551         any_restricted = False
1552         for tc in self.get_testcases(include_onlymatching=False):
1553             if 'playlist' in tc:
1554                 tc = tc['playlist'][0]
1555             is_restricted = age_restricted(
1556                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1557             if not is_restricted:
1558                 return True
1559             any_restricted = any_restricted or is_restricted
1560         return not any_restricted
1561
1562     def extract_subtitles(self, *args, **kwargs):
1563         if (self._downloader.params.get('writesubtitles', False) or
1564                 self._downloader.params.get('listsubtitles')):
1565             return self._get_subtitles(*args, **kwargs)
1566         return {}
1567
1568     def _get_subtitles(self, *args, **kwargs):
1569         raise NotImplementedError("This method must be implemented by subclasses")
1570
1571     @staticmethod
1572     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1573         """ Merge subtitle items for one language. Items with duplicated URLs
1574         will be dropped. """
1575         list1_urls = set([item['url'] for item in subtitle_list1])
1576         ret = list(subtitle_list1)
1577         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1578         return ret
1579
1580     @classmethod
1581     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1582         """ Merge two subtitle dictionaries, language by language. """
1583         ret = dict(subtitle_dict1)
1584         for lang in subtitle_dict2:
1585             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1586         return ret
1587
1588     def extract_automatic_captions(self, *args, **kwargs):
1589         if (self._downloader.params.get('writeautomaticsub', False) or
1590                 self._downloader.params.get('listsubtitles')):
1591             return self._get_automatic_captions(*args, **kwargs)
1592         return {}
1593
1594     def _get_automatic_captions(self, *args, **kwargs):
1595         raise NotImplementedError("This method must be implemented by subclasses")
1596
1597
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search pseudo-URLs."""
        prefix_and_query = r'(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return '%s%s' % (cls._SEARCH_KEY, prefix_and_query)

    @classmethod
    def suitable(cls, url):
        """Tell whether url matches the pattern from _make_valid_url()."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of
        results: one for an empty prefix, _MAX_RESULTS for "all", and the
        given number (capped at _MAX_RESULTS, with a warning) otherwise."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, search_terms = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(search_terms, 1)
        if prefix == 'all':
            return self._get_n_results(search_terms, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (requested, search_terms))
        if requested > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(search_terms, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError(
            "This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY