_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_http_client,
  20     compat_urllib_error,
  21     compat_urllib_parse,
  22     compat_urlparse,
  23     compat_str,
  24     compat_etree_fromstring,
  25 )
  26 from ..utils import (
  27     NO_DEFAULT,
  28     age_restricted,
  29     bug_reports_message,
  30     clean_html,
  31     compiled_regex_type,
  32     determine_ext,
  33     error_to_compat_str,
  34     ExtractorError,
  35     fix_xml_ampersands,
  36     float_or_none,
  37     int_or_none,
  38     parse_iso8601,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     sanitized_Request,
  42     unescapeHTML,
  43     unified_strdate,
  44     url_basename,
  45     xpath_text,
  46     xpath_with_ns,
  47     determine_protocol,
  48     parse_duration,
  49 )
  50
  51
  52 class InfoExtractor(object):
  53     """Information Extractor class.
  54
  55     Information extractors are the classes that, given a URL, extract
  56     information about the video (or videos) the URL refers to. This
  57     information includes the real video URL, the video title, author and
  58     others. The information is stored in a dictionary which is then
  59     passed to the YoutubeDL. The YoutubeDL processes this
  60     information possibly downloading the video to the file system, among
  61     other possible outcomes.
  62
  63     The type field determines the type of the result.
  64     By far the most common value (and the default if _type is missing) is
  65     "video", which indicates a single video.
  66
  67     For a video, the dictionaries must include the following fields:
  68
  69     id:             Video identifier.
  70     title:          Video title, unescaped.
  71
  72     Additionally, it must contain either a formats entry or a url one:
  73
  74     formats:        A list of dictionaries for each format available, ordered
  75                     from worst to best quality.
  76
  77                     Potential fields:
  78                     * url        Mandatory. The URL of the video file
  79                     * ext        Will be calculated from URL if missing
  80                     * format     A human-readable description of the format
  81                                  ("mp4 container with h264/opus").
  82                                  Calculated from the format_id, width, height,
  83                                  and format_note fields if missing.
  84                     * format_id  A short description of the format
  85                                  ("mp4_h264_opus" or "19").
  86                                 Technically optional, but strongly recommended.
  87                     * format_note Additional info about the format
  88                                  ("3D" or "DASH video")
  89                     * width      Width of the video, if known
  90                     * height     Height of the video, if known
  91                     * resolution Textual description of width and height
  92                     * tbr        Average bitrate of audio and video in KBit/s
  93                     * abr        Average audio bitrate in KBit/s
  94                     * acodec     Name of the audio codec in use
  95                     * asr        Audio sampling rate in Hertz
  96                     * vbr        Average video bitrate in KBit/s
  97                     * fps        Frame rate
  98                     * vcodec     Name of the video codec in use
  99                     * container  Name of the container format
 100                     * filesize   The number of bytes, if known in advance
 101                     * filesize_approx  An estimate for the number of bytes
 102                     * player_url SWF Player URL (used for rtmpdump).
 103                     * protocol   The protocol that will be used for the actual
 104                                  download, lower-case.
 105                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 106                                  "m3u8", or "m3u8_native".
 107                     * preference Order number of this format. If this field is
 108                                  present and not None, the formats get sorted
 109                                  by this field, regardless of all other values.
 110                                  -1 for default (order by other properties),
 111                                  -2 or smaller for less than default.
 112                                  < -1000 to hide the format (if there is
 113                                     another one which is strictly better)
 114                     * language   Language code, e.g. "de" or "en-US".
 115                     * language_preference  Is this in the language mentioned in
 116                                  the URL?
 117                                  10 if it's what the URL is about,
 118                                  -1 for default (don't know),
 119                                  -10 otherwise, other values reserved for now.
 120                     * quality    Order number of the video quality of this
 121                                  format, irrespective of the file format.
 122                                  -1 for default (order by other properties),
 123                                  -2 or smaller for less than default.
 124                     * source_preference  Order number for this video source
 125                                   (quality takes higher priority)
 126                                  -1 for default (order by other properties),
 127                                  -2 or smaller for less than default.
 128                     * http_headers  A dictionary of additional HTTP headers
 129                                  to add to the request.
 130                     * stretched_ratio  If given and not 1, indicates that the
 131                                  video's pixels are not square.
 132                                  width : height ratio as float.
 133                     * no_resume  The server does not support resuming the
 134                                  (HTTP or RTMP) download. Boolean.
 135
 136     url:            Final video URL.
 137     ext:            Video filename extension.
 138     format:         The video format, defaults to ext (used for --get-format)
 139     player_url:     SWF Player URL (used for rtmpdump).
 140
 141     The following fields are optional:
 142
 143     alt_title:      A secondary title of the video.
 144     display_id      An alternative identifier for the video, not necessarily
 145                     unique, but available before title. Typically, id is
 146                     something like "4234987", title "Dancing naked mole rats",
 147                     and display_id "dancing-naked-mole-rats"
 148     thumbnails:     A list of dictionaries, with the following entries:
 149                         * "id" (optional, string) - Thumbnail format ID
 150                         * "url"
 151                         * "preference" (optional, int) - quality of the image
 152                         * "width" (optional, int)
 153                         * "height" (optional, int)
 154                         * "resolution" (optional, string "{width}x{height}",
 155                                         deprecated)
 156     thumbnail:      Full URL to a video thumbnail image.
 157     description:    Full video description.
 158     uploader:       Full name of the video uploader.
 159     creator:        The main artist who created the video.
 160     release_date:   The date (YYYYMMDD) when the video was released.
 161     timestamp:      UNIX timestamp of the moment the video became available.
 162     upload_date:    Video upload date (YYYYMMDD).
 163                     If not explicitly set, calculated from timestamp.
 164     uploader_id:    Nickname or id of the video uploader.
 165     location:       Physical location where the video was filmed.
 166     subtitles:      The available subtitles as a dictionary in the format
 167                     {language: subformats}. "subformats" is a list sorted from
 168                     lower to higher preference, each element is a dictionary
 169                     with the "ext" entry and one of:
 170                         * "data": The subtitles file contents
 171                         * "url": A URL pointing to the subtitles file
 172                     "ext" will be calculated from URL if missing
 173     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 174                     automatically generated captions
 175     duration:       Length of the video in seconds, as an integer or float.
 176     view_count:     How many users have watched the video on the platform.
 177     like_count:     Number of positive ratings of the video
 178     dislike_count:  Number of negative ratings of the video
 179     repost_count:   Number of reposts of the video
 180     average_rating: Average rating given by users, the scale used depends on the webpage
 181     comment_count:  Number of comments on the video
 182     comments:       A list of comments, each with one or more of the following
 183                     properties (all but one of text or html optional):
 184                         * "author" - human-readable name of the comment author
 185                         * "author_id" - user ID of the comment author
 186                         * "id" - Comment ID
 187                         * "html" - Comment as HTML
 188                         * "text" - Plain text of the comment
 189                         * "timestamp" - UNIX timestamp of comment
 190                         * "parent" - ID of the comment this one is replying to.
 191                                      Set to "root" to indicate that this is a
 192                                      comment to the original video.
 193     age_limit:      Age restriction for the video, as an integer (years)
 194     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 195                     should allow to get the same result again. (It will be set
 196                     by YoutubeDL if it's missing)
 197     categories:     A list of categories that the video falls in, for example
 198                     ["Sports", "Berlin"]
 199     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 200     is_live:        True, False, or None (=unknown). Whether this video is a
 201                     live stream that goes on instead of a fixed-length video.
 202     start_time:     Time in seconds where the reproduction should start, as
 203                     specified in the URL.
 204     end_time:       Time in seconds where the reproduction should end, as
 205                     specified in the URL.
 206
 207     The following fields should only be used when the video belongs to some logical
 208     chapter or section:
 209
 210     chapter:        Name or title of the chapter the video belongs to.
 211     chapter_number: Number of the chapter the video belongs to, as an integer.
 212     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 213
 214     The following fields should only be used when the video is an episode of some
 215     series or programme:
 216
 217     series:         Title of the series or programme the video episode belongs to.
 218     season:         Title of the season the video episode belongs to.
 219     season_number:  Number of the season the video episode belongs to, as an integer.
 220     season_id:      Id of the season the video episode belongs to, as a unicode string.
 221     episode:        Title of the video episode. Unlike mandatory video title field,
 222                     this field should denote the exact title of the video episode
 223                     without any kind of decoration.
 224     episode_number: Number of the video episode within a season, as an integer.
 225     episode_id:     Id of the video episode, as a unicode string.
 226
 227     Unless mentioned otherwise, the fields should be Unicode strings.
 228
 229     Unless mentioned otherwise, None is equivalent to absence of information.
 230
 231
 232     _type "playlist" indicates multiple videos.
 233     There must be a key "entries", which is a list, an iterable, or a PagedList
 234     object, each element of which is a valid dictionary by this specification.
 235
 236     Additionally, playlists can have "title", "description" and "id" attributes
 237     with the same semantics as videos (see above).
 238
 239
 240     _type "multi_video" indicates that there are multiple videos that
 241     form a single show, for examples multiple acts of an opera or TV episode.
 242     It must have an entries key like a playlist and contain all the keys
 243     required for a video at the same time.
 244
 245
 246     _type "url" indicates that the video must be extracted from another
 247     location, possibly by a different extractor. Its only required key is:
 248     "url" - the next URL to extract.
 249     The key "ie_key" can be set to the class name (minus the trailing "IE",
 250     e.g. "Youtube") if the extractor class is known in advance.
 251     Additionally, the dictionary may have any properties of the resolved entity
 252     known in advance, for example "title" if the title of the referred video is
 253     known ahead of time.
 254
 255
 256     _type "url_transparent" entities have the same specification as "url", but
 257     indicate that the given additional information is more precise than the one
 258     associated with the resolved URL.
 259     This is useful when a site employs a video service that hosts the video and
 260     its technical metadata, but that video service does not embed a useful
 261     title, description etc.
 262
 263
 264     Subclasses of this one should re-define the _real_initialize() and
 265     _real_extract() methods and define a _VALID_URL regexp.
 266     Probably, they should also be added to the list of extractors.
 267
 268     Finally, the _WORKING attribute should be set to False for broken IEs
 269     in order to warn the users and skip the tests.
 270     """
 271
 272     _ready = False
 273     _downloader = None
 274     _WORKING = True
 275
 276     def __init__(self, downloader=None):
 277         """Constructor. Receives an optional downloader."""
 278         self._ready = False
 279         self.set_downloader(downloader)
 280
 281     @classmethod
 282     def suitable(cls, url):
 283         """Receives a URL and returns True if suitable for this IE."""
 284
 285         # This does not use has/getattr intentionally - we want to know whether
 286         # we have cached the regexp for *this* class, whereas getattr would also
 287         # match the superclass
 288         if '_VALID_URL_RE' not in cls.__dict__:
 289             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 290         return cls._VALID_URL_RE.match(url) is not None
 291
 292     @classmethod
 293     def _match_id(cls, url):
 294         if '_VALID_URL_RE' not in cls.__dict__:
 295             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 296         m = cls._VALID_URL_RE.match(url)
 297         assert m
 298         return m.group('id')
 299
 300     @classmethod
 301     def working(cls):
 302         """Getter method for _WORKING."""
 303         return cls._WORKING
 304
 305     def initialize(self):
 306         """Initializes an instance (authentication, etc)."""
 307         if not self._ready:
 308             self._real_initialize()
 309             self._ready = True
 310
 311     def extract(self, url):
 312         """Extracts URL information and returns it in list of dicts."""
 313         try:
 314             self.initialize()
 315             return self._real_extract(url)
 316         except ExtractorError:
 317             raise
 318         except compat_http_client.IncompleteRead as e:
 319             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 320         except (KeyError, StopIteration) as e:
 321             raise ExtractorError('An extractor error has occurred.', cause=e)
 322
 323     def set_downloader(self, downloader):
 324         """Sets the downloader for this IE."""
 325         self._downloader = downloader
 326
 327     def _real_initialize(self):
 328         """Real initialization process. Redefine in subclasses."""
 329         pass
 330
 331     def _real_extract(self, url):
 332         """Real extraction process. Redefine in subclasses."""
 333         pass
 334
 335     @classmethod
 336     def ie_key(cls):
 337         """A string for getting the InfoExtractor with get_info_extractor"""
 338         return compat_str(cls.__name__[:-2])
 339
 340     @property
 341     def IE_NAME(self):
 342         return compat_str(type(self).__name__[:-2])
 343
 344     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 345         """ Returns the response handle """
 346         if note is None:
 347             self.report_download_webpage(video_id)
 348         elif note is not False:
 349             if video_id is None:
 350                 self.to_screen('%s' % (note,))
 351             else:
 352                 self.to_screen('%s: %s' % (video_id, note))
 353         try:
 354             return self._downloader.urlopen(url_or_request)
 355         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 356             if errnote is False:
 357                 return False
 358             if errnote is None:
 359                 errnote = 'Unable to download webpage'
 360
 361             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 362             if fatal:
 363                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 364             else:
 365                 self._downloader.report_warning(errmsg)
 366                 return False
 367
 368     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 369         """ Returns a tuple (page content as string, URL handle) """
 370         # Strip hashes from the URL (#1038)
 371         if isinstance(url_or_request, (compat_str, str)):
 372             url_or_request = url_or_request.partition('#')[0]
 373
 374         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 375         if urlh is False:
 376             assert not fatal
 377             return False
 378         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 379         return (content, urlh)
 380
 381     @staticmethod
 382     def _guess_encoding_from_content(content_type, webpage_bytes):
 383         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 384         if m:
 385             encoding = m.group(1)
 386         else:
 387             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 388                           webpage_bytes[:1024])
 389             if m:
 390                 encoding = m.group(1).decode('ascii')
 391             elif webpage_bytes.startswith(b'\xff\xfe'):
 392                 encoding = 'utf-16'
 393             else:
 394                 encoding = 'utf-8'
 395
 396         return encoding
 397
 398     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 399         content_type = urlh.headers.get('Content-Type', '')
 400         webpage_bytes = urlh.read()
 401         if prefix is not None:
 402             webpage_bytes = prefix + webpage_bytes
 403         if not encoding:
 404             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 405         if self._downloader.params.get('dump_intermediate_pages', False):
 406             try:
 407                 url = url_or_request.get_full_url()
 408             except AttributeError:
 409                 url = url_or_request
 410             self.to_screen('Dumping request to ' + url)
 411             dump = base64.b64encode(webpage_bytes).decode('ascii')
 412             self._downloader.to_screen(dump)
 413         if self._downloader.params.get('write_pages', False):
 414             try:
 415                 url = url_or_request.get_full_url()
 416             except AttributeError:
 417                 url = url_or_request
 418             basen = '%s_%s' % (video_id, url)
 419             if len(basen) > 240:
 420                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 421                 basen = basen[:240 - len(h)] + h
 422             raw_filename = basen + '.dump'
 423             filename = sanitize_filename(raw_filename, restricted=True)
 424             self.to_screen('Saving request to ' + filename)
 425             # Working around MAX_PATH limitation on Windows (see
 426             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 427             if os.name == 'nt':
 428                 absfilepath = os.path.abspath(filename)
 429                 if len(absfilepath) > 259:
 430                     filename = '\\\\?\\' + absfilepath
 431             with open(filename, 'wb') as outf:
 432                 outf.write(webpage_bytes)
 433
 434         try:
 435             content = webpage_bytes.decode(encoding, 'replace')
 436         except LookupError:
 437             content = webpage_bytes.decode('utf-8', 'replace')
 438
 439         if ('<title>Access to this site is blocked</title>' in content and
 440                 'Websense' in content[:512]):
 441             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 442             blocked_iframe = self._html_search_regex(
 443                 r'<iframe src="([^"]+)"', content,
 444                 'Websense information URL', default=None)
 445             if blocked_iframe:
 446                 msg += ' Visit %s for more details' % blocked_iframe
 447             raise ExtractorError(msg, expected=True)
 448         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 449             msg = (
 450                 'Access to this webpage has been blocked by Indian censorship. '
 451                 'Use a VPN or proxy server (with --proxy) to route around it.')
 452             block_msg = self._html_search_regex(
 453                 r'</h1><p>(.*?)</p>',
 454                 content, 'block message', default=None)
 455             if block_msg:
 456                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 457             raise ExtractorError(msg, expected=True)
 458
 459         return content
 460
 461     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 462         """ Returns the data of the page as a string """
 463         success = False
 464         try_count = 0
 465         while success is False:
 466             try:
 467                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 468                 success = True
 469             except compat_http_client.IncompleteRead as e:
 470                 try_count += 1
 471                 if try_count >= tries:
 472                     raise e
 473                 self._sleep(timeout, video_id)
 474         if res is False:
 475             return res
 476         else:
 477             content, _ = res
 478             return content
 479
 480     def _download_xml(self, url_or_request, video_id,
 481                       note='Downloading XML', errnote='Unable to download XML',
 482                       transform_source=None, fatal=True, encoding=None):
 483         """Return the xml as an xml.etree.ElementTree.Element"""
 484         xml_string = self._download_webpage(
 485             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 486         if xml_string is False:
 487             return xml_string
 488         if transform_source:
 489             xml_string = transform_source(xml_string)
 490         return compat_etree_fromstring(xml_string.encode('utf-8'))
 491
 492     def _download_json(self, url_or_request, video_id,
 493                        note='Downloading JSON metadata',
 494                        errnote='Unable to download JSON metadata',
 495                        transform_source=None,
 496                        fatal=True, encoding=None):
 497         json_string = self._download_webpage(
 498             url_or_request, video_id, note, errnote, fatal=fatal,
 499             encoding=encoding)
 500         if (not fatal) and json_string is False:
 501             return None
 502         return self._parse_json(
 503             json_string, video_id, transform_source=transform_source, fatal=fatal)
 504
 505     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 506         if transform_source:
 507             json_string = transform_source(json_string)
 508         try:
 509             return json.loads(json_string)
 510         except ValueError as ve:
 511             errmsg = '%s: Failed to parse JSON ' % video_id
 512             if fatal:
 513                 raise ExtractorError(errmsg, cause=ve)
 514             else:
 515                 self.report_warning(errmsg + str(ve))
 516
 517     def report_warning(self, msg, video_id=None):
 518         idstr = '' if video_id is None else '%s: ' % video_id
 519         self._downloader.report_warning(
 520             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 521
 522     def to_screen(self, msg):
 523         """Print msg to screen, prefixing it with '[ie_name]'"""
 524         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 525
 526     def report_extraction(self, id_or_name):
 527         """Report information extraction."""
 528         self.to_screen('%s: Extracting information' % id_or_name)
 529
 530     def report_download_webpage(self, video_id):
 531         """Report webpage download."""
 532         self.to_screen('%s: Downloading webpage' % video_id)
 533
 534     def report_age_confirmation(self):
 535         """Report attempt to confirm age."""
 536         self.to_screen('Confirming age')
 537
 538     def report_login(self):
 539         """Report attempt to log in."""
 540         self.to_screen('Logging in')
 541
 542     @staticmethod
 543     def raise_login_required(msg='This video is only available for registered users'):
 544         raise ExtractorError(
 545             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 546             expected=True)
 547
 548     @staticmethod
 549     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 550         raise ExtractorError(
 551             '%s. You might want to use --proxy to workaround.' % msg,
 552             expected=True)
 553
 554     # Methods for following #608
 555     @staticmethod
 556     def url_result(url, ie=None, video_id=None, video_title=None):
 557         """Returns a URL that points to a page that should be processed"""
 558         # TODO: ie should be the class used for getting the info
 559         video_info = {'_type': 'url',
 560                       'url': url,
 561                       'ie_key': ie}
 562         if video_id is not None:
 563             video_info['id'] = video_id
 564         if video_title is not None:
 565             video_info['title'] = video_title
 566         return video_info
 567
 568     @staticmethod
 569     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 570         """Returns a playlist"""
 571         video_info = {'_type': 'playlist',
 572                       'entries': entries}
 573         if playlist_id:
 574             video_info['id'] = playlist_id
 575         if playlist_title:
 576             video_info['title'] = playlist_title
 577         if playlist_description:
 578             video_info['description'] = playlist_description
 579         return video_info
 580
 581     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 582         """
 583         Perform a regex search on the given string, using a single or a list of
 584         patterns returning the first matching group.
 585         In case of failure return a default value or raise a WARNING or a
 586         RegexNotFoundError, depending on fatal, specifying the field name.
 587         """
 588         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 589             mobj = re.search(pattern, string, flags)
 590         else:
 591             for p in pattern:
 592                 mobj = re.search(p, string, flags)
 593                 if mobj:
 594                     break
 595
 596         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 597             _name = '\033[0;34m%s\033[0m' % name
 598         else:
 599             _name = name
 600
 601         if mobj:
 602             if group is None:
 603                 # return the first matching group
 604                 return next(g for g in mobj.groups() if g is not None)
 605             else:
 606                 return mobj.group(group)
 607         elif default is not NO_DEFAULT:
 608             return default
 609         elif fatal:
 610             raise RegexNotFoundError('Unable to extract %s' % _name)
 611         else:
 612             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 613             return None
 614
 615     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 616         """
 617         Like _search_regex, but strips HTML tags and unescapes entities.
 618         """
 619         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 620         if res:
 621             return clean_html(res).strip()
 622         else:
 623             return res
 624
 625     def _get_login_info(self):
 626         """
 627         Get the login info as (username, password)
 628         It will look in the netrc file using the _NETRC_MACHINE value
 629         If there's no info available, return (None, None)
 630         """
 631         if self._downloader is None:
 632             return (None, None)
 633
 634         username = None
 635         password = None
 636         downloader_params = self._downloader.params
 637
 638         # Attempt to use provided username and password or .netrc data
 639         if downloader_params.get('username') is not None:
 640             username = downloader_params['username']
 641             password = downloader_params['password']
 642         elif downloader_params.get('usenetrc', False):
 643             try:
 644                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 645                 if info is not None:
 646                     username = info[0]
 647                     password = info[2]
 648                 else:
 649                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 650             except (IOError, netrc.NetrcParseError) as err:
 651                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 652
 653         return (username, password)
 654
 655     def _get_tfa_info(self, note='two-factor verification code'):
 656         """
 657         Get the two-factor authentication info
 658         TODO - asking the user will be required for sms/phone verify
 659         currently just uses the command line option
 660         If there's no info available, return None
 661         """
 662         if self._downloader is None:
 663             return None
 664         downloader_params = self._downloader.params
 665
 666         if downloader_params.get('twofactor') is not None:
 667             return downloader_params['twofactor']
 668
 669         return compat_getpass('Type %s and press [Return]: ' % note)
 670
 671     # Helper functions for extracting OpenGraph info
 672     @staticmethod
 673     def _og_regexes(prop):
 674         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 675         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 676                        % {'prop': re.escape(prop)})
 677         template = r'<meta[^>]+?%s[^>]+?%s'
 678         return [
 679             template % (property_re, content_re),
 680             template % (content_re, property_re),
 681         ]
 682
 683     @staticmethod
 684     def _meta_regex(prop):
 685         return r'''(?isx)<meta
 686                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 687                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 688
 689     def _og_search_property(self, prop, html, name=None, **kargs):
 690         if name is None:
 691             name = 'OpenGraph %s' % prop
 692         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 693         if escaped is None:
 694             return None
 695         return unescapeHTML(escaped)
 696
 697     def _og_search_thumbnail(self, html, **kargs):
 698         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 699
 700     def _og_search_description(self, html, **kargs):
 701         return self._og_search_property('description', html, fatal=False, **kargs)
 702
 703     def _og_search_title(self, html, **kargs):
 704         return self._og_search_property('title', html, **kargs)
 705
 706     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 707         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 708         if secure:
 709             regexes = self._og_regexes('video:secure_url') + regexes
 710         return self._html_search_regex(regexes, html, name, **kargs)
 711
 712     def _og_search_url(self, html, **kargs):
 713         return self._og_search_property('url', html, **kargs)
 714
 715     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 716         if display_name is None:
 717             display_name = name
 718         return self._html_search_regex(
 719             self._meta_regex(name),
 720             html, display_name, fatal=fatal, group='content', **kwargs)
 721
 722     def _dc_search_uploader(self, html):
 723         return self._html_search_meta('dc.creator', html, 'uploader')
 724
 725     def _rta_search(self, html):
 726         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 727         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 728                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 729                      html):
 730             return 18
 731         return 0
 732
 733     def _media_rating_search(self, html):
 734         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 735         rating = self._html_search_meta('rating', html)
 736
 737         if not rating:
 738             return None
 739
 740         RATING_TABLE = {
 741             'safe for kids': 0,
 742             'general': 8,
 743             '14 years': 14,
 744             'mature': 17,
 745             'restricted': 19,
 746         }
 747         return RATING_TABLE.get(rating.lower())
 748
 749     def _family_friendly_search(self, html):
 750         # See http://schema.org/VideoObject
 751         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 752
 753         if not family_friendly:
 754             return None
 755
 756         RATING_TABLE = {
 757             '1': 0,
 758             'true': 0,
 759             '0': 18,
 760             'false': 18,
 761         }
 762         return RATING_TABLE.get(family_friendly.lower())
 763
 764     def _twitter_search_player(self, html):
 765         return self._html_search_meta('twitter:player', html,
 766                                       'twitter card player')
 767
 768     def _search_json_ld(self, html, video_id, **kwargs):
 769         json_ld = self._search_regex(
 770             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 771             html, 'JSON-LD', group='json_ld', **kwargs)
 772         if not json_ld:
 773             return {}
 774         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 775
 776     def _json_ld(self, json_ld, video_id, fatal=True):
 777         if isinstance(json_ld, compat_str):
 778             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 779         if not json_ld:
 780             return {}
 781         info = {}
 782         if json_ld.get('@context') == 'http://schema.org':
 783             item_type = json_ld.get('@type')
 784             if item_type == 'TVEpisode':
 785                 info.update({
 786                     'episode': unescapeHTML(json_ld.get('name')),
 787                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 788                     'description': unescapeHTML(json_ld.get('description')),
 789                 })
 790                 part_of_season = json_ld.get('partOfSeason')
 791                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 792                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 793                 part_of_series = json_ld.get('partOfSeries')
 794                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 795                     info['series'] = unescapeHTML(part_of_series.get('name'))
 796             elif item_type == 'Article':
 797                 info.update({
 798                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 799                     'title': unescapeHTML(json_ld.get('headline')),
 800                     'description': unescapeHTML(json_ld.get('articleBody')),
 801                 })
 802         return dict((k, v) for k, v in info.items() if v is not None)
 803
 804     @staticmethod
 805     def _hidden_inputs(html):
 806         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 807         hidden_inputs = {}
 808         for input in re.findall(r'(?i)<input([^>]+)>', html):
 809             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 810                 continue
 811             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 812             if not name:
 813                 continue
 814             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 815             if not value:
 816                 continue
 817             hidden_inputs[name.group('value')] = value.group('value')
 818         return hidden_inputs
 819
 820     def _form_hidden_inputs(self, form_id, html):
 821         form = self._search_regex(
 822             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 823             html, '%s form' % form_id, group='form')
 824         return self._hidden_inputs(form)
 825
 826     def _sort_formats(self, formats, field_preference=None):
 827         if not formats:
 828             raise ExtractorError('No video formats found')
 829
 830         for f in formats:
 831             # Automatically determine tbr when missing based on abr and vbr (improves
 832             # formats sorting in some cases)
 833             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 834                 f['tbr'] = f['abr'] + f['vbr']
 835
 836         def _formats_key(f):
 837             # TODO remove the following workaround
 838             from ..utils import determine_ext
 839             if not f.get('ext') and 'url' in f:
 840                 f['ext'] = determine_ext(f['url'])
 841
 842             if isinstance(field_preference, (list, tuple)):
 843                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 844
 845             preference = f.get('preference')
 846             if preference is None:
 847                 preference = 0
 848                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 849                     preference -= 0.5
 850
 851             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 852
 853             if f.get('vcodec') == 'none':  # audio only
 854                 if self._downloader.params.get('prefer_free_formats'):
 855                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 856                 else:
 857                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 858                 ext_preference = 0
 859                 try:
 860                     audio_ext_preference = ORDER.index(f['ext'])
 861                 except ValueError:
 862                     audio_ext_preference = -1
 863             else:
 864                 if self._downloader.params.get('prefer_free_formats'):
 865                     ORDER = ['flv', 'mp4', 'webm']
 866                 else:
 867                     ORDER = ['webm', 'flv', 'mp4']
 868                 try:
 869                     ext_preference = ORDER.index(f['ext'])
 870                 except ValueError:
 871                     ext_preference = -1
 872                 audio_ext_preference = 0
 873
 874             return (
 875                 preference,
 876                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 877                 f.get('quality') if f.get('quality') is not None else -1,
 878                 f.get('tbr') if f.get('tbr') is not None else -1,
 879                 f.get('filesize') if f.get('filesize') is not None else -1,
 880                 f.get('vbr') if f.get('vbr') is not None else -1,
 881                 f.get('height') if f.get('height') is not None else -1,
 882                 f.get('width') if f.get('width') is not None else -1,
 883                 proto_preference,
 884                 ext_preference,
 885                 f.get('abr') if f.get('abr') is not None else -1,
 886                 audio_ext_preference,
 887                 f.get('fps') if f.get('fps') is not None else -1,
 888                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 889                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 890                 f.get('format_id') if f.get('format_id') is not None else '',
 891             )
 892         formats.sort(key=_formats_key)
 893
 894     def _check_formats(self, formats, video_id):
 895         if formats:
 896             formats[:] = filter(
 897                 lambda f: self._is_valid_url(
 898                     f['url'], video_id,
 899                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 900                 formats)
 901
 902     def _is_valid_url(self, url, video_id, item='video'):
 903         url = self._proto_relative_url(url, scheme='http:')
 904         # For now assume non HTTP(S) URLs always valid
 905         if not (url.startswith('http://') or url.startswith('https://')):
 906             return True
 907         try:
 908             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 909             return True
 910         except ExtractorError as e:
 911             if isinstance(e.cause, compat_urllib_error.URLError):
 912                 self.to_screen(
 913                     '%s: %s URL is invalid, skipping' % (video_id, item))
 914                 return False
 915             raise
 916
 917     def http_scheme(self):
 918         """ Either "http:" or "https:", depending on the user's preferences """
 919         return (
 920             'http:'
 921             if self._downloader.params.get('prefer_insecure', False)
 922             else 'https:')
 923
 924     def _proto_relative_url(self, url, scheme=None):
 925         if url is None:
 926             return url
 927         if url.startswith('//'):
 928             if scheme is None:
 929                 scheme = self.http_scheme()
 930             return scheme + url
 931         else:
 932             return url
 933
 934     def _sleep(self, timeout, video_id, msg_template=None):
 935         if msg_template is None:
 936             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 937         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 938         self.to_screen(msg)
 939         time.sleep(timeout)
 940
 941     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 942                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 943                              fatal=True):
 944         manifest = self._download_xml(
 945             manifest_url, video_id, 'Downloading f4m manifest',
 946             'Unable to download f4m manifest',
 947             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 948             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 949             transform_source=transform_source,
 950             fatal=fatal)
 951
 952         if manifest is False:
 953             return []
 954
 955         formats = []
 956         manifest_version = '1.0'
 957         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 958         if not media_nodes:
 959             manifest_version = '2.0'
 960             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 961         base_url = xpath_text(
 962             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 963             'base URL', default=None)
 964         if base_url:
 965             base_url = base_url.strip()
 966         for i, media_el in enumerate(media_nodes):
 967             if manifest_version == '2.0':
 968                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 969                 if not media_url:
 970                     continue
 971                 manifest_url = (
 972                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 973                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 974                 # If media_url is itself a f4m manifest do the recursive extraction
 975                 # since bitrates in parent manifest (this one) and media_url manifest
 976                 # may differ leading to inability to resolve the format by requested
 977                 # bitrate in f4m downloader
 978                 if determine_ext(manifest_url) == 'f4m':
 979                     formats.extend(self._extract_f4m_formats(
 980                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
 981                     continue
 982             tbr = int_or_none(media_el.attrib.get('bitrate'))
 983             formats.append({
 984                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 985                 'url': manifest_url,
 986                 'ext': 'flv',
 987                 'tbr': tbr,
 988                 'width': int_or_none(media_el.attrib.get('width')),
 989                 'height': int_or_none(media_el.attrib.get('height')),
 990                 'preference': preference,
 991             })
 992         self._sort_formats(formats)
 993
 994         return formats
 995
 996     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 997                               entry_protocol='m3u8', preference=None,
 998                               m3u8_id=None, note=None, errnote=None,
 999                               fatal=True):
1000
1001         formats = [{
1002             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1003             'url': m3u8_url,
1004             'ext': ext,
1005             'protocol': 'm3u8',
1006             'preference': preference - 1 if preference else -1,
1007             'resolution': 'multiple',
1008             'format_note': 'Quality selection URL',
1009         }]
1010
1011         format_url = lambda u: (
1012             u
1013             if re.match(r'^https?://', u)
1014             else compat_urlparse.urljoin(m3u8_url, u))
1015
1016         res = self._download_webpage_handle(
1017             m3u8_url, video_id,
1018             note=note or 'Downloading m3u8 information',
1019             errnote=errnote or 'Failed to download m3u8 information',
1020             fatal=fatal)
1021         if res is False:
1022             return []
1023         m3u8_doc, urlh = res
1024         m3u8_url = urlh.geturl()
1025         # A Media Playlist Tag MUST NOT appear in a Master Playlist
1026         # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1027         # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
1028         # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1029         if '#EXT-X-TARGETDURATION' in m3u8_doc:
1030             return [{
1031                 'url': m3u8_url,
1032                 'format_id': m3u8_id,
1033                 'ext': ext,
1034                 'protocol': entry_protocol,
1035                 'preference': preference,
1036             }]
1037         last_info = None
1038         last_media = None
1039         kv_rex = re.compile(
1040             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1041         for line in m3u8_doc.splitlines():
1042             if line.startswith('#EXT-X-STREAM-INF:'):
1043                 last_info = {}
1044                 for m in kv_rex.finditer(line):
1045                     v = m.group('val')
1046                     if v.startswith('"'):
1047                         v = v[1:-1]
1048                     last_info[m.group('key')] = v
1049             elif line.startswith('#EXT-X-MEDIA:'):
1050                 last_media = {}
1051                 for m in kv_rex.finditer(line):
1052                     v = m.group('val')
1053                     if v.startswith('"'):
1054                         v = v[1:-1]
1055                     last_media[m.group('key')] = v
1056             elif line.startswith('#') or not line.strip():
1057                 continue
1058             else:
1059                 if last_info is None:
1060                     formats.append({'url': format_url(line)})
1061                     continue
1062                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1063                 format_id = []
1064                 if m3u8_id:
1065                     format_id.append(m3u8_id)
1066                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1067                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1068                 f = {
1069                     'format_id': '-'.join(format_id),
1070                     'url': format_url(line.strip()),
1071                     'tbr': tbr,
1072                     'ext': ext,
1073                     'protocol': entry_protocol,
1074                     'preference': preference,
1075                 }
1076                 codecs = last_info.get('CODECS')
1077                 if codecs:
1078                     # TODO: looks like video codec is not always necessarily goes first
1079                     va_codecs = codecs.split(',')
1080                     if va_codecs[0]:
1081                         f['vcodec'] = va_codecs[0]
1082                     if len(va_codecs) > 1 and va_codecs[1]:
1083                         f['acodec'] = va_codecs[1]
1084                 resolution = last_info.get('RESOLUTION')
1085                 if resolution:
1086                     width_str, height_str = resolution.split('x')
1087                     f['width'] = int(width_str)
1088                     f['height'] = int(height_str)
1089                 if last_media is not None:
1090                     f['m3u8_media'] = last_media
1091                     last_media = None
1092                 formats.append(f)
1093                 last_info = {}
1094         self._sort_formats(formats)
1095         return formats
1096
1097     @staticmethod
1098     def _xpath_ns(path, namespace=None):
1099         if not namespace:
1100             return path
1101         out = []
1102         for c in path.split('/'):
1103             if not c or c == '.':
1104                 out.append(c)
1105             else:
1106                 out.append('{%s}%s' % (namespace, c))
1107         return '/'.join(out)
1108
1109     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1110         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1111
1112         if smil is False:
1113             assert not fatal
1114             return []
1115
1116         namespace = self._parse_smil_namespace(smil)
1117
1118         return self._parse_smil_formats(
1119             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1120
1121     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1122         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1123         if smil is False:
1124             return {}
1125         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1126
1127     def _download_smil(self, smil_url, video_id, fatal=True):
1128         return self._download_xml(
1129             smil_url, video_id, 'Downloading SMIL file',
1130             'Unable to download SMIL file', fatal=fatal)
1131
1132     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1133         namespace = self._parse_smil_namespace(smil)
1134
1135         formats = self._parse_smil_formats(
1136             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1137         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1138
1139         video_id = os.path.splitext(url_basename(smil_url))[0]
1140         title = None
1141         description = None
1142         upload_date = None
1143         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1144             name = meta.attrib.get('name')
1145             content = meta.attrib.get('content')
1146             if not name or not content:
1147                 continue
1148             if not title and name == 'title':
1149                 title = content
1150             elif not description and name in ('description', 'abstract'):
1151                 description = content
1152             elif not upload_date and name == 'date':
1153                 upload_date = unified_strdate(content)
1154
1155         thumbnails = [{
1156             'id': image.get('type'),
1157             'url': image.get('src'),
1158             'width': int_or_none(image.get('width')),
1159             'height': int_or_none(image.get('height')),
1160         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1161
1162         return {
1163             'id': video_id,
1164             'title': title or video_id,
1165             'description': description,
1166             'upload_date': upload_date,
1167             'thumbnails': thumbnails,
1168             'formats': formats,
1169             'subtitles': subtitles,
1170         }
1171
1172     def _parse_smil_namespace(self, smil):
1173         return self._search_regex(
1174             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1175
1176     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1177         base = smil_url
1178         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1179             b = meta.get('base') or meta.get('httpBase')
1180             if b:
1181                 base = b
1182                 break
1183
1184         formats = []
1185         rtmp_count = 0
1186         http_count = 0
1187         m3u8_count = 0
1188
1189         srcs = []
1190         videos = smil.findall(self._xpath_ns('.//video', namespace))
1191         for video in videos:
1192             src = video.get('src')
1193             if not src or src in srcs:
1194                 continue
1195             srcs.append(src)
1196
1197             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1198             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1199             width = int_or_none(video.get('width'))
1200             height = int_or_none(video.get('height'))
1201             proto = video.get('proto')
1202             ext = video.get('ext')
1203             src_ext = determine_ext(src)
1204             streamer = video.get('streamer') or base
1205
1206             if proto == 'rtmp' or streamer.startswith('rtmp'):
1207                 rtmp_count += 1
1208                 formats.append({
1209                     'url': streamer,
1210                     'play_path': src,
1211                     'ext': 'flv',
1212                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1213                     'tbr': bitrate,
1214                     'filesize': filesize,
1215                     'width': width,
1216                     'height': height,
1217                 })
1218                 if transform_rtmp_url:
1219                     streamer, src = transform_rtmp_url(streamer, src)
1220                     formats[-1].update({
1221                         'url': streamer,
1222                         'play_path': src,
1223                     })
1224                 continue
1225
1226             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1227             src_url = src_url.strip()
1228
1229             if proto == 'm3u8' or src_ext == 'm3u8':
1230                 m3u8_formats = self._extract_m3u8_formats(
1231                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1232                 if len(m3u8_formats) == 1:
1233                     m3u8_count += 1
1234                     m3u8_formats[0].update({
1235                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1236                         'tbr': bitrate,
1237                         'width': width,
1238                         'height': height,
1239                     })
1240                 formats.extend(m3u8_formats)
1241                 continue
1242
1243             if src_ext == 'f4m':
1244                 f4m_url = src_url
1245                 if not f4m_params:
1246                     f4m_params = {
1247                         'hdcore': '3.2.0',
1248                         'plugin': 'flowplayer-3.2.0.1',
1249                     }
1250                 f4m_url += '&' if '?' in f4m_url else '?'
1251                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1252                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1253                 continue
1254
1255             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1256                 http_count += 1
1257                 formats.append({
1258                     'url': src_url,
1259                     'ext': ext or src_ext or 'flv',
1260                     'format_id': 'http-%d' % (bitrate or http_count),
1261                     'tbr': bitrate,
1262                     'filesize': filesize,
1263                     'width': width,
1264                     'height': height,
1265                 })
1266                 continue
1267
1268         self._sort_formats(formats)
1269
1270         return formats
1271
1272     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1273         urls = []
1274         subtitles = {}
1275         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1276             src = textstream.get('src')
1277             if not src or src in urls:
1278                 continue
1279             urls.append(src)
1280             ext = textstream.get('ext') or determine_ext(src)
1281             if not ext:
1282                 type_ = textstream.get('type')
1283                 SUBTITLES_TYPES = {
1284                     'text/vtt': 'vtt',
1285                     'text/srt': 'srt',
1286                     'application/smptett+xml': 'tt',
1287                 }
1288                 if type_ in SUBTITLES_TYPES:
1289                     ext = SUBTITLES_TYPES[type_]
1290             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1291             subtitles.setdefault(lang, []).append({
1292                 'url': src,
1293                 'ext': ext,
1294             })
1295         return subtitles
1296
1297     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1298         xspf = self._download_xml(
1299             playlist_url, playlist_id, 'Downloading xpsf playlist',
1300             'Unable to download xspf manifest', fatal=fatal)
1301         if xspf is False:
1302             return []
1303         return self._parse_xspf(xspf, playlist_id)
1304
1305     def _parse_xspf(self, playlist, playlist_id):
1306         NS_MAP = {
1307             'xspf': 'http://xspf.org/ns/0/',
1308             's1': 'http://static.streamone.nl/player/ns/0',
1309         }
1310
1311         entries = []
1312         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1313             title = xpath_text(
1314                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1315             description = xpath_text(
1316                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1317             thumbnail = xpath_text(
1318                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1319             duration = float_or_none(
1320                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1321
1322             formats = [{
1323                 'url': location.text,
1324                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1325                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1326                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1327             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1328             self._sort_formats(formats)
1329
1330             entries.append({
1331                 'id': playlist_id,
1332                 'title': title,
1333                 'description': description,
1334                 'thumbnail': thumbnail,
1335                 'duration': duration,
1336                 'formats': formats,
1337             })
1338         return entries
1339
1340     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1341         res = self._download_webpage_handle(
1342             mpd_url, video_id,
1343             note=note or 'Downloading MPD manifest',
1344             errnote=errnote or 'Failed to download MPD manifest',
1345             fatal=fatal)
1346         if res is False:
1347             return []
1348         mpd, urlh = res
1349         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1350
1351         return self._parse_mpd_formats(
1352             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1353
1354     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1355         if mpd_doc.get('type') == 'dynamic':
1356             return []
1357
1358         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1359
1360         def _add_ns(path):
1361             return self._xpath_ns(path, namespace)
1362
1363         def is_drm_protected(element):
1364             return element.find(_add_ns('ContentProtection')) is not None
1365
1366         def extract_multisegment_info(element, ms_parent_info):
1367             ms_info = ms_parent_info.copy()
1368             segment_list = element.find(_add_ns('SegmentList'))
1369             if segment_list is not None:
1370                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1371                 if segment_urls_e:
1372                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1373                 initialization = segment_list.find(_add_ns('Initialization'))
1374                 if initialization is not None:
1375                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1376             else:
1377                 segment_template = element.find(_add_ns('SegmentTemplate'))
1378                 if segment_template is not None:
1379                     start_number = segment_template.get('startNumber')
1380                     if start_number:
1381                         ms_info['start_number'] = int(start_number)
1382                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1383                     if segment_timeline is not None:
1384                         s_e = segment_timeline.findall(_add_ns('S'))
1385                         if s_e:
1386                             ms_info['total_number'] = 0
1387                             for s in s_e:
1388                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1389                     else:
1390                         timescale = segment_template.get('timescale')
1391                         if timescale:
1392                             ms_info['timescale'] = int(timescale)
1393                         segment_duration = segment_template.get('duration')
1394                         if segment_duration:
1395                             ms_info['segment_duration'] = int(segment_duration)
1396                     media_template = segment_template.get('media')
1397                     if media_template:
1398                         ms_info['media_template'] = media_template
1399                     initialization = segment_template.get('initialization')
1400                     if initialization:
1401                         ms_info['initialization_url'] = initialization
1402                     else:
1403                         initialization = segment_template.find(_add_ns('Initialization'))
1404                         if initialization is not None:
1405                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1406             return ms_info
1407
1408         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1409         formats = []
1410         for period in mpd_doc.findall(_add_ns('Period')):
1411             period_duration = parse_duration(period.get('duration')) or mpd_duration
1412             period_ms_info = extract_multisegment_info(period, {
1413                 'start_number': 1,
1414                 'timescale': 1,
1415             })
1416             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1417                 if is_drm_protected(adaptation_set):
1418                     continue
1419                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1420                 for representation in adaptation_set.findall(_add_ns('Representation')):
1421                     if is_drm_protected(representation):
1422                         continue
1423                     representation_attrib = adaptation_set.attrib.copy()
1424                     representation_attrib.update(representation.attrib)
1425                     mime_type = representation_attrib.get('mimeType')
1426                     content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1427                     if content_type == 'text':
1428                         # TODO implement WebVTT downloading
1429                         pass
1430                     elif content_type == 'video' or content_type == 'audio':
1431                         base_url = ''
1432                         for element in (representation, adaptation_set, period, mpd_doc):
1433                             base_url_e = element.find(_add_ns('BaseURL'))
1434                             if base_url_e is not None:
1435                                 base_url = base_url_e.text + base_url
1436                                 if re.match(r'^https?://', base_url):
1437                                     break
1438                         if mpd_base_url and not re.match(r'^https?://', base_url):
1439                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1440                                 mpd_base_url += '/'
1441                             base_url = mpd_base_url + base_url
1442                         representation_id = representation_attrib.get('id')
1443                         lang = representation_attrib.get('lang')
1444                         url_el = representation.find(_add_ns('BaseURL'))
1445                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1446                         f = {
1447                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1448                             'url': base_url,
1449                             'width': int_or_none(representation_attrib.get('width')),
1450                             'height': int_or_none(representation_attrib.get('height')),
1451                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1452                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1453                             'fps': int_or_none(representation_attrib.get('frameRate')),
1454                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1455                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1456                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1457                             'format_note': 'DASH %s' % content_type,
1458                             'filesize': filesize,
1459                         }
1460                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1461                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1462                             if 'total_number' not in representation_ms_info and 'segment_duration':
1463                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1464                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1465                             media_template = representation_ms_info['media_template']
1466                             media_template = media_template.replace('$RepresentationID$', representation_id)
1467                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1468                             media_template.replace('$$', '$')
1469                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1470                         if 'segment_urls' in representation_ms_info:
1471                             f.update({
1472                                 'segment_urls': representation_ms_info['segment_urls'],
1473                                 'protocol': 'http_dash_segments',
1474                             })
1475                             if 'initialization_url' in representation_ms_info:
1476                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1477                                 f.update({
1478                                     'initialization_url': initialization_url,
1479                                 })
1480                                 if not f.get('url'):
1481                                     f['url'] = initialization_url
1482                         try:
1483                             existing_format = next(
1484                                 fo for fo in formats
1485                                 if fo['format_id'] == representation_id)
1486                         except StopIteration:
1487                             full_info = formats_dict.get(representation_id, {}).copy()
1488                             full_info.update(f)
1489                             formats.append(full_info)
1490                         else:
1491                             existing_format.update(f)
1492                     else:
1493                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1494         self._sort_formats(formats)
1495         return formats
1496
1497     def _live_title(self, name):
1498         """ Generate the title for a live video """
1499         now = datetime.datetime.now()
1500         now_str = now.strftime("%Y-%m-%d %H:%M")
1501         return name + ' ' + now_str
1502
1503     def _int(self, v, name, fatal=False, **kwargs):
1504         res = int_or_none(v, **kwargs)
1505         if 'get_attr' in kwargs:
1506             print(getattr(v, kwargs['get_attr']))
1507         if res is None:
1508             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1509             if fatal:
1510                 raise ExtractorError(msg)
1511             else:
1512                 self._downloader.report_warning(msg)
1513         return res
1514
1515     def _float(self, v, name, fatal=False, **kwargs):
1516         res = float_or_none(v, **kwargs)
1517         if res is None:
1518             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1519             if fatal:
1520                 raise ExtractorError(msg)
1521             else:
1522                 self._downloader.report_warning(msg)
1523         return res
1524
1525     def _set_cookie(self, domain, name, value, expire_time=None):
1526         cookie = compat_cookiejar.Cookie(
1527             0, name, value, None, None, domain, None,
1528             None, '/', True, False, expire_time, '', None, None, None)
1529         self._downloader.cookiejar.set_cookie(cookie)
1530
1531     def _get_cookies(self, url):
1532         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1533         req = sanitized_Request(url)
1534         self._downloader.cookiejar.add_cookie_header(req)
1535         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1536
1537     def get_testcases(self, include_onlymatching=False):
1538         t = getattr(self, '_TEST', None)
1539         if t:
1540             assert not hasattr(self, '_TESTS'), \
1541                 '%s has _TEST and _TESTS' % type(self).__name__
1542             tests = [t]
1543         else:
1544             tests = getattr(self, '_TESTS', [])
1545         for t in tests:
1546             if not include_onlymatching and t.get('only_matching', False):
1547                 continue
1548             t['name'] = type(self).__name__[:-len('IE')]
1549             yield t
1550
1551     def is_suitable(self, age_limit):
1552         """ Test whether the extractor is generally suitable for the given
1553         age limit (i.e. pornographic sites are not, all others usually are) """
1554
1555         any_restricted = False
1556         for tc in self.get_testcases(include_onlymatching=False):
1557             if 'playlist' in tc:
1558                 tc = tc['playlist'][0]
1559             is_restricted = age_restricted(
1560                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1561             if not is_restricted:
1562                 return True
1563             any_restricted = any_restricted or is_restricted
1564         return not any_restricted
1565
1566     def extract_subtitles(self, *args, **kwargs):
1567         if (self._downloader.params.get('writesubtitles', False) or
1568                 self._downloader.params.get('listsubtitles')):
1569             return self._get_subtitles(*args, **kwargs)
1570         return {}
1571
1572     def _get_subtitles(self, *args, **kwargs):
1573         raise NotImplementedError("This method must be implemented by subclasses")
1574
1575     @staticmethod
1576     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1577         """ Merge subtitle items for one language. Items with duplicated URLs
1578         will be dropped. """
1579         list1_urls = set([item['url'] for item in subtitle_list1])
1580         ret = list(subtitle_list1)
1581         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1582         return ret
1583
1584     @classmethod
1585     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1586         """ Merge two subtitle dictionaries, language by language. """
1587         ret = dict(subtitle_dict1)
1588         for lang in subtitle_dict2:
1589             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1590         return ret
1591
1592     def extract_automatic_captions(self, *args, **kwargs):
1593         if (self._downloader.params.get('writeautomaticsub', False) or
1594                 self._downloader.params.get('listsubtitles')):
1595             return self._get_automatic_captions(*args, **kwargs)
1596         return {}
1597
1598     def _get_automatic_captions(self, *args, **kwargs):
1599         raise NotImplementedError("This method must be implemented by subclasses")
1600
1601
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that run paged search queries.

    Queries have the form _SEARCH_KEY<prefix>:<query>. An empty prefix asks
    for one result, a positive number for that many results and "all" for
    _MAX_RESULTS results.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression that matches this extractor's queries"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Parse the query and fetch the requested number of results"""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Requests above the maximum are capped, with a warning.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY