1 from __future__ import unicode_literals
15 from ..compat import (
18 compat_etree_fromstring,
55 class InfoExtractor(object):
56 """Information Extractor class.
58 Information extractors are the classes that, given a URL, extract
59 information about the video (or videos) the URL refers to. This
60 information includes the real video URL, the video title, author and
61 others. The information is stored in a dictionary which is then
62 passed to the YoutubeDL. The YoutubeDL processes this
63 information possibly downloading the video to the file system, among
64 other possible outcomes.
66 The type field determines the type of the result.
67 By far the most common value (and the default if _type is missing) is
68 "video", which indicates a single video.
70 For a video, the dictionaries must include the following fields:
73 title: Video title, unescaped.
75 Additionally, it must contain either a formats entry or a url one:
77 formats: A list of dictionaries for each format available, ordered
78 from worst to best quality.
81 * url Mandatory. The URL of the video file
82 * ext Will be calculated from URL if missing
83 * format A human-readable description of the format
84 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
86 and format_note fields if missing.
87 * format_id A short description of the format
88 ("mp4_h264_opus" or "19").
89 Technically optional, but strongly recommended.
90 * format_note Additional info about the format
91 ("3D" or "DASH video")
92 * width Width of the video, if known
93 * height Height of the video, if known
94 * resolution Textual description of width and height
95 * tbr Average bitrate of audio and video in KBit/s
96 * abr Average audio bitrate in KBit/s
97 * acodec Name of the audio codec in use
98 * asr Audio sampling rate in Hertz
99 * vbr Average video bitrate in KBit/s
101 * vcodec Name of the video codec in use
102 * container Name of the container format
103 * filesize The number of bytes, if known in advance
104 * filesize_approx An estimate for the number of bytes
105 * player_url SWF Player URL (used for rtmpdump).
106 * protocol The protocol that will be used for the actual
107 download, lower-case.
108 "http", "https", "rtsp", "rtmp", "rtmpe",
109 "m3u8", "m3u8_native" or "http_dash_segments".
110 * preference Order number of this format. If this field is
111 present and not None, the formats get sorted
112 by this field, regardless of all other values.
113 -1 for default (order by other properties),
114 -2 or smaller for less than default.
115 < -1000 to hide the format (if there is
116 another one which is strictly better)
117 * language Language code, e.g. "de" or "en-US".
118 * language_preference Is this in the language mentioned in
120 10 if it's what the URL is about,
121 -1 for default (don't know),
122 -10 otherwise, other values reserved for now.
123 * quality Order number of the video quality of this
124 format, irrespective of the file format.
125 -1 for default (order by other properties),
126 -2 or smaller for less than default.
127 * source_preference Order number for this video source
128 (quality takes higher priority)
129 -1 for default (order by other properties),
130 -2 or smaller for less than default.
131 * http_headers A dictionary of additional HTTP headers
132 to add to the request.
133 * stretched_ratio If given and not 1, indicates that the
134 video's pixels are not square.
135 width : height ratio as float.
136 * no_resume The server does not support resuming the
137 (HTTP or RTMP) download. Boolean.
139 url: Final video URL.
140 ext: Video filename extension.
141 format: The video format, defaults to ext (used for --get-format)
142 player_url: SWF Player URL (used for rtmpdump).
144 The following fields are optional:
146 alt_title: A secondary title of the video.
147 display_id An alternative identifier for the video, not necessarily
148 unique, but available before title. Typically, id is
149 something like "4234987", title "Dancing naked mole rats",
150 and display_id "dancing-naked-mole-rats"
151 thumbnails: A list of dictionaries, with the following entries:
152 * "id" (optional, string) - Thumbnail format ID
154 * "preference" (optional, int) - quality of the image
155 * "width" (optional, int)
156 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
159 thumbnail: Full URL to a video thumbnail image.
160 description: Full video description.
161 uploader: Full name of the video uploader.
162 license: License name the video is licensed under.
163 creator: The main artist who created the video.
164 release_date: The date (YYYYMMDD) when the video was released.
165 timestamp: UNIX timestamp of the moment the video became available.
166 upload_date: Video upload date (YYYYMMDD).
167 If not explicitly set, calculated from timestamp.
168 uploader_id: Nickname or id of the video uploader.
169 uploader_url: Full URL to a personal webpage of the video uploader.
170 location: Physical location where the video was filmed.
171 subtitles: The available subtitles as a dictionary in the format
172 {language: subformats}. "subformats" is a list sorted from
173 lower to higher preference, each element is a dictionary
174 with the "ext" entry and one of:
175 * "data": The subtitles file contents
176 * "url": A URL pointing to the subtitles file
177 "ext" will be calculated from URL if missing
178 automatic_captions: Like 'subtitles', used by the YoutubeIE for
179 automatically generated captions
180 duration: Length of the video in seconds, as an integer or float.
181 view_count: How many users have watched the video on the platform.
182 like_count: Number of positive ratings of the video
183 dislike_count: Number of negative ratings of the video
184 repost_count: Number of reposts of the video
average_rating: Average rating given by users, the scale used depends on the webpage
186 comment_count: Number of comments on the video
187 comments: A list of comments, each with one or more of the following
188 properties (all but one of text or html optional):
189 * "author" - human-readable name of the comment author
190 * "author_id" - user ID of the comment author
192 * "html" - Comment as HTML
193 * "text" - Plain text of the comment
194 * "timestamp" - UNIX timestamp of comment
195 * "parent" - ID of the comment this one is replying to.
196 Set to "root" to indicate that this is a
197 comment to the original video.
198 age_limit: Age restriction for the video, as an integer (years)
199 webpage_url: The URL to the video webpage, if given to youtube-dl it
200 should allow to get the same result again. (It will be set
201 by YoutubeDL if it's missing)
202 categories: A list of categories that the video falls in, for example
204 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
205 is_live: True, False, or None (=unknown). Whether this video is a
206 live stream that goes on instead of a fixed-length video.
207 start_time: Time in seconds where the reproduction should start, as
208 specified in the URL.
209 end_time: Time in seconds where the reproduction should end, as
210 specified in the URL.
212 The following fields should only be used when the video belongs to some logical
215 chapter: Name or title of the chapter the video belongs to.
216 chapter_number: Number of the chapter the video belongs to, as an integer.
217 chapter_id: Id of the chapter the video belongs to, as a unicode string.
219 The following fields should only be used when the video is an episode of some
222 series: Title of the series or programme the video episode belongs to.
223 season: Title of the season the video episode belongs to.
224 season_number: Number of the season the video episode belongs to, as an integer.
225 season_id: Id of the season the video episode belongs to, as a unicode string.
226 episode: Title of the video episode. Unlike mandatory video title field,
227 this field should denote the exact title of the video episode
228 without any kind of decoration.
229 episode_number: Number of the video episode within a season, as an integer.
230 episode_id: Id of the video episode, as a unicode string.
232 Unless mentioned otherwise, the fields should be Unicode strings.
234 Unless mentioned otherwise, None is equivalent to absence of information.
237 _type "playlist" indicates multiple videos.
238 There must be a key "entries", which is a list, an iterable, or a PagedList
239 object, each element of which is a valid dictionary by this specification.
241 Additionally, playlists can have "title", "description" and "id" attributes
242 with the same semantics as videos (see above).
245 _type "multi_video" indicates that there are multiple videos that
form a single show, for example multiple acts of an opera or TV episode.
247 It must have an entries key like a playlist and contain all the keys
248 required for a video at the same time.
251 _type "url" indicates that the video must be extracted from another
252 location, possibly by a different extractor. Its only required key is:
253 "url" - the next URL to extract.
254 The key "ie_key" can be set to the class name (minus the trailing "IE",
255 e.g. "Youtube") if the extractor class is known in advance.
256 Additionally, the dictionary may have any properties of the resolved entity
257 known in advance, for example "title" if the title of the referred video is
261 _type "url_transparent" entities have the same specification as "url", but
262 indicate that the given additional information is more precise than the one
263 associated with the resolved URL.
264 This is useful when a site employs a video service that hosts the video and
265 its technical metadata, but that video service does not embed a useful
266 title, description etc.
269 Subclasses of this one should re-define the _real_initialize() and
270 _real_extract() methods and define a _VALID_URL regexp.
271 Probably, they should also be added to the list of extractors.
273 Finally, the _WORKING attribute should be set to False for broken IEs
274 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Delegate to set_downloader() so the downloader can also be
        # (re-)attached later through the same code path.
        self.set_downloader(downloader)
287 def suitable(cls, url):
288 """Receives a URL and returns True if suitable for this IE."""
290 # This does not use has/getattr intentionally - we want to know whether
291 # we have cached the regexp for *this* class, whereas getattr would also
292 # match the superclass
293 if '_VALID_URL_RE' not in cls.__dict__:
294 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
295 return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        """Match *url* against this class's cached _VALID_URL pattern."""
        # Same per-class caching trick as in suitable(): checking
        # cls.__dict__ avoids reusing a regexp cached on a superclass.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
307 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # The real work is delegated to the subclass hook.
            self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
            return self._real_extract(url)
        except ExtractorError:
        # Network-level truncation is reported as an "expected" error so the
        # user gets a clean message instead of a traceback.
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        # A KeyError/StopIteration escaping an extractor is a bug in the
        # extractor itself; wrap it so the original cause is preserved.
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
328 def set_downloader(self, downloader):
329 """Sets the downloader for this IE."""
330 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Subclass hook for per-site setup such as authentication.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Subclasses return an info dict (or playlist/url result) as
        # described in the class docstring.
342 """A string for getting the InfoExtractor with get_info_extractor"""
343 return compat_str(cls.__name__[:-2])
347 return compat_str(type(self).__name__[:-2])
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" report,
        # False -> no message at all, anything else -> printed as given.
            self.report_download_webpage(video_id)
        elif note is not False:
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            # Fatal errors re-raise with the original traceback and cause;
            # non-fatal ones are downgraded to a downloader warning.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
                self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]
        # (Request objects pass through unchanged.)
        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Best-effort charset detection: Content-Type header first, then an
        HTML meta charset tag, then a BOM sniff."""
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Only scan the start of the document for a meta charset.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
        # b'\xff\xfe' is the UTF-16 little-endian BOM.
        elif webpage_bytes.startswith(b'\xff\xfe'):
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Decode the response body of *urlh*, honouring the
        dump_intermediate_pages / write_pages downloader params, and turn
        well-known censorship pages into ExtractorErrors."""
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            # base64 keeps binary-ish payloads printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
                # Keep the dump filename bounded by replacing the tail with a
                # hash of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)
            content = webpage_bytes.decode(encoding, 'replace')
            content = webpage_bytes.decode('utf-8', 'replace')
        # Websense filtering block page detection.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian censorship block page detection.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
        """ Returns the data of the page as a string """
        # Retry loop: IncompleteRead is retried up to *tries* times with a
        # *timeout*-second sleep between attempts.
        while success is False:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
            except compat_http_client.IncompleteRead as e:
                if try_count >= tries:
                self._sleep(timeout, video_id)
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
        if xml_string is False:
            # Optional caller hook to fix up broken XML before parsing.
            xml_string = transform_source(xml_string)
        return compat_etree_fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None):
        # Download a page and parse it as JSON via _parse_json, forwarding
        # transform_source/fatal.
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
        # A non-fatal failed download yields False; propagate quietly.
        if (not fatal) and json_string is False:
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        """Parse *json_string*; on failure raise (fatal) or warn and continue."""
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
                raise ExtractorError(errmsg, cause=ve)
                self.report_warning(errmsg + str(ve))
522 def report_warning(self, msg, video_id=None):
523 idstr = '' if video_id is None else '%s: ' % video_id
524 self._downloader.report_warning(
525 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
527 def to_screen(self, msg):
528 """Print msg to screen, prefixing it with '[ie_name]'"""
529 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
531 def report_extraction(self, id_or_name):
532 """Report information extraction."""
533 self.to_screen('%s: Extracting information' % id_or_name)
535 def report_download_webpage(self, video_id):
536 """Report webpage download."""
537 self.to_screen('%s: Downloading webpage' % video_id)
539 def report_age_confirmation(self):
540 """Report attempt to confirm age."""
541 self.to_screen('Confirming age')
543 def report_login(self):
544 """Report attempt to log in."""
545 self.to_screen('Logging in')
    def raise_login_required(msg='This video is only available for registered users'):
        # Abort extraction, telling the user how to supply credentials.
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        # Abort extraction, suggesting --proxy as the standard workaround.
        raise ExtractorError(
            '%s. You might want to use --proxy to workaround.' % msg,
559 # Methods for following #608
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # id/title are optional hints, attached only when actually known.
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # id/title/description are attached only when supplied/truthy.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
                mobj = re.search(p, string, flags)
        # Colorize the field name in error output on capable terminals only.
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
            # clean_html also unescapes entities; strip() trims whitespace.
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
        downloader_params = self._downloader.params
        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username') is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    # No ~/.netrc entry for this machine.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best effort: a broken .netrc only produces a warning.
                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
        return (username, password)
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
        downloader_params = self._downloader.params
        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']
        # Fall back to interactively prompting on the terminal.
        return compat_getpass('Type %s and press [Return]: ' % note)
676 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        """Build regexes matching the OpenGraph meta tag for *prop*, covering
        both attribute orders (property-then-content and the reverse)."""
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _meta_regex(prop):
        """Regex for a <meta> tag whose itemprop/name/property/id/http-equiv
        equals *prop*; the content attribute is captured as 'content'."""
        # (?isx): dot-all, case-insensitive, verbose — the literal whitespace
        # inside the pattern is ignored by the regex engine.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search *html* for the og:<prop> meta tag and return its unescaped content."""
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
702 def _og_search_thumbnail(self, html, **kargs):
703 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
705 def _og_search_description(self, html, **kargs):
706 return self._og_search_property('description', html, fatal=False, **kargs)
708 def _og_search_title(self, html, **kargs):
709 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Extract the video URL from og:video / og:video:url meta tags."""
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            # og:video:secure_url variants are tried first when secure.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
717 def _og_search_url(self, html, **kargs):
718 return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Find a <meta> tag by *name* (see _meta_regex) and return its content."""
        if display_name is None:
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
727 def _dc_search_uploader(self, html):
728 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        """Detect the RTA ("Restricted To Adults") label in the page."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # RATING_TABLE maps the textual rating to an age limit
        # (table definition not visible in this chunk).
        return RATING_TABLE.get(rating.lower())
    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)
        if not family_friendly:
        # RATING_TABLE maps the isFamilyFriendly value to an age limit
        # (table definition not visible in this chunk).
        return RATING_TABLE.get(family_friendly.lower())
769 def _twitter_search_player(self, html):
770 return self._html_search_meta('twitter:player', html,
771 'twitter card player')
    def _search_json_ld(self, html, video_id, **kwargs):
        """Locate the JSON-LD <script> block in *html* and map it via _json_ld."""
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
    def _json_ld(self, json_ld, video_id, fatal=True):
        """Map a schema.org JSON-LD object (string or dict) to info-dict fields."""
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if json_ld.get('@context') == 'http://schema.org':
            item_type = json_ld.get('@type')
            if item_type == 'TVEpisode':
                    'episode': unescapeHTML(json_ld.get('name')),
                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
                    'description': unescapeHTML(json_ld.get('description')),
                part_of_season = json_ld.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = json_ld.get('partOfSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
                    'title': unescapeHTML(json_ld.get('headline')),
                    'description': unescapeHTML(json_ld.get('articleBody')),
        # Drop keys whose value could not be extracted.
        return dict((k, v) for k, v in info.items() if v is not None)
    def _hidden_inputs(html):
        """Collect name -> value for hidden/submit <input> fields in *html*."""
        # HTML comments may contain bogus <input>s; drop them first.
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        for input in re.findall(r'(?i)<input([^>]+)>', html):
            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
            hidden_inputs[name.group('value')] = value.group('value')
825 def _form_hidden_inputs(self, form_id, html):
826 form = self._search_regex(
827 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
828 html, '%s form' % form_id, group='form')
829 return self._hidden_inputs(form)
    def _sort_formats(self, formats, field_preference=None):
        """Sort *formats* in place from worst to best quality."""
            raise ExtractorError('No video formats found')
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])
            # Caller-supplied field list short-circuits all other criteria.
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
            preference = f.get('preference')
            if preference is None:
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                    ORDER = ['webm', 'flv', 'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0
                # Missing numeric fields compare as -1 (worst), missing
                # format_id as the empty string.
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        """Drop formats whose URL does not respond (see _is_valid_url)."""
                lambda f: self._is_valid_url(
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
    def _remove_duplicate_formats(formats):
        """Drop formats sharing the same URL, keeping the first occurrence; in place."""
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats
    def _is_valid_url(self, url, video_id, item='video'):
        """Probe *url* with a request; URL-level failures mark it invalid."""
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                    '%s: %s URL is invalid, skipping' % (video_id, item))
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
            # prefer_insecure downgrades protocol-relative URLs to http.
            if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        """Resolve a protocol-relative //host/path URL against *scheme*
        (defaulting to the user's preferred scheme, see http_scheme)."""
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        # msg_template may reference %(video_id)s and %(timeout)s.
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
        # A False manifest means the download failed non-fatally.
        if manifest is False:
        # Detect the manifest version by which namespace the <media> nodes use.
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
            base_url = base_url.strip()
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
            tbr = int_or_none(media_el.attrib.get('bitrate'))
                # Fall back to the node index when no bitrate is declared.
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
        self._sort_formats(formats)
1011 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1012 entry_protocol='m3u8', preference=None,
1013 m3u8_id=None, note=None, errnote=None,
1017 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1021 'preference': preference - 1 if preference else -1,
1022 'resolution': 'multiple',
1023 'format_note': 'Quality selection URL',
1026 format_url = lambda u: (
1028 if re.match(r'^https?://', u)
1029 else compat_urlparse.urljoin(m3u8_url, u))
1031 res = self._download_webpage_handle(
1033 note=note or 'Downloading m3u8 information',
1034 errnote=errnote or 'Failed to download m3u8 information',
1038 m3u8_doc, urlh = res
1039 m3u8_url = urlh.geturl()
1041 # We should try extracting formats only from master playlists [1], i.e.
1042 # playlists that describe available qualities. On the other hand media
1043 # playlists [2] should be returned as is since they contain just the media
1044 # without qualities renditions.
1045 # Fortunately, master playlist can be easily distinguished from media
1046 # playlist based on particular tags availability. As of [1, 2] master
1047 # playlist tags MUST NOT appear in a media playist and vice versa.
1048 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1049 # and MUST NOT appear in master playlist thus we can clearly detect media
1050 # playlist with this criterion.
1051 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1052 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1053 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1054 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1057 'format_id': m3u8_id,
1059 'protocol': entry_protocol,
1060 'preference': preference,
1064 kv_rex = re.compile(
1065 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1066 for line in m3u8_doc.splitlines():
1067 if line.startswith('#EXT-X-STREAM-INF:'):
1069 for m in kv_rex.finditer(line):
1071 if v.startswith('"'):
1073 last_info[m.group('key')] = v
1074 elif line.startswith('#EXT-X-MEDIA:'):
1076 for m in kv_rex.finditer(line):
1078 if v.startswith('"'):
1080 last_media[m.group('key')] = v
1081 elif line.startswith('#') or not line.strip():
1084 if last_info is None:
1085 formats.append({'url': format_url(line)})
1087 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1090 format_id.append(m3u8_id)
1091 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1092 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1094 'format_id': '-'.join(format_id),
1095 'url': format_url(line.strip()),
1098 'protocol': entry_protocol,
1099 'preference': preference,
1101 resolution = last_info.get('RESOLUTION')
1103 width_str, height_str = resolution.split('x')
1104 f['width'] = int(width_str)
1105 f['height'] = int(height_str)
1106 codecs = last_info.get('CODECS')
1108 vcodec, acodec = [None] * 2
1109 va_codecs = codecs.split(',')
1110 if len(va_codecs) == 1:
1111 # Audio only entries usually come with single codec and
1112 # no resolution. For more robustness we also check it to
1114 if not resolution and va_codecs[0].startswith('mp4a'):
1115 vcodec, acodec = 'none', va_codecs[0]
1117 vcodec = va_codecs[0]
1119 vcodec, acodec = va_codecs[:2]
1124 if last_media is not None:
1125 f['m3u8_media'] = last_media
1129 self._sort_formats(formats)
1133 def _xpath_ns(path, namespace=None):
1137 for c in path.split('/'):
1138 if not c or c == '.':
1141 out.append('{%s}%s' % (namespace, c))
1142 return '/'.join(out)
1144 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1145 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1151 namespace = self._parse_smil_namespace(smil)
1153 return self._parse_smil_formats(
1154 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1156 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1157 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1160 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1162 def _download_smil(self, smil_url, video_id, fatal=True):
1163 return self._download_xml(
1164 smil_url, video_id, 'Downloading SMIL file',
1165 'Unable to download SMIL file', fatal=fatal)
1167 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1168 namespace = self._parse_smil_namespace(smil)
1170 formats = self._parse_smil_formats(
1171 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1172 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1174 video_id = os.path.splitext(url_basename(smil_url))[0]
1178 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1179 name = meta.attrib.get('name')
1180 content = meta.attrib.get('content')
1181 if not name or not content:
1183 if not title and name == 'title':
1185 elif not description and name in ('description', 'abstract'):
1186 description = content
1187 elif not upload_date and name == 'date':
1188 upload_date = unified_strdate(content)
1191 'id': image.get('type'),
1192 'url': image.get('src'),
1193 'width': int_or_none(image.get('width')),
1194 'height': int_or_none(image.get('height')),
1195 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1199 'title': title or video_id,
1200 'description': description,
1201 'upload_date': upload_date,
1202 'thumbnails': thumbnails,
1204 'subtitles': subtitles,
1207 def _parse_smil_namespace(self, smil):
1208 return self._search_regex(
1209 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1211 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1213 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1214 b = meta.get('base') or meta.get('httpBase')
1225 videos = smil.findall(self._xpath_ns('.//video', namespace))
1226 for video in videos:
1227 src = video.get('src')
1228 if not src or src in srcs:
1232 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1233 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1234 width = int_or_none(video.get('width'))
1235 height = int_or_none(video.get('height'))
1236 proto = video.get('proto')
1237 ext = video.get('ext')
1238 src_ext = determine_ext(src)
1239 streamer = video.get('streamer') or base
1241 if proto == 'rtmp' or streamer.startswith('rtmp'):
1247 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1249 'filesize': filesize,
1253 if transform_rtmp_url:
1254 streamer, src = transform_rtmp_url(streamer, src)
1255 formats[-1].update({
1261 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1262 src_url = src_url.strip()
1264 if proto == 'm3u8' or src_ext == 'm3u8':
1265 m3u8_formats = self._extract_m3u8_formats(
1266 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1267 if len(m3u8_formats) == 1:
1269 m3u8_formats[0].update({
1270 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1275 formats.extend(m3u8_formats)
1278 if src_ext == 'f4m':
1283 'plugin': 'flowplayer-3.2.0.1',
1285 f4m_url += '&' if '?' in f4m_url else '?'
1286 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1287 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1290 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1294 'ext': ext or src_ext or 'flv',
1295 'format_id': 'http-%d' % (bitrate or http_count),
1297 'filesize': filesize,
1303 self._sort_formats(formats)
1307 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1310 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1311 src = textstream.get('src')
1312 if not src or src in urls:
1315 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1316 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1317 subtitles.setdefault(lang, []).append({
1323 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1324 xspf = self._download_xml(
1325 playlist_url, playlist_id, 'Downloading xpsf playlist',
1326 'Unable to download xspf manifest', fatal=fatal)
1329 return self._parse_xspf(xspf, playlist_id)
1331 def _parse_xspf(self, playlist, playlist_id):
1333 'xspf': 'http://xspf.org/ns/0/',
1334 's1': 'http://static.streamone.nl/player/ns/0',
1338 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1340 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1341 description = xpath_text(
1342 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1343 thumbnail = xpath_text(
1344 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1345 duration = float_or_none(
1346 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1349 'url': location.text,
1350 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1351 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1352 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1353 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1354 self._sort_formats(formats)
1359 'description': description,
1360 'thumbnail': thumbnail,
1361 'duration': duration,
1366 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1367 res = self._download_webpage_handle(
1369 note=note or 'Downloading MPD manifest',
1370 errnote=errnote or 'Failed to download MPD manifest',
1375 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1377 return self._parse_mpd_formats(
1378 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1380 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1381 if mpd_doc.get('type') == 'dynamic':
1384 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1387 return self._xpath_ns(path, namespace)
1389 def is_drm_protected(element):
1390 return element.find(_add_ns('ContentProtection')) is not None
1392 def extract_multisegment_info(element, ms_parent_info):
1393 ms_info = ms_parent_info.copy()
1394 segment_list = element.find(_add_ns('SegmentList'))
1395 if segment_list is not None:
1396 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1398 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1399 initialization = segment_list.find(_add_ns('Initialization'))
1400 if initialization is not None:
1401 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1403 segment_template = element.find(_add_ns('SegmentTemplate'))
1404 if segment_template is not None:
1405 start_number = segment_template.get('startNumber')
1407 ms_info['start_number'] = int(start_number)
1408 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1409 if segment_timeline is not None:
1410 s_e = segment_timeline.findall(_add_ns('S'))
1412 ms_info['total_number'] = 0
1414 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1416 timescale = segment_template.get('timescale')
1418 ms_info['timescale'] = int(timescale)
1419 segment_duration = segment_template.get('duration')
1420 if segment_duration:
1421 ms_info['segment_duration'] = int(segment_duration)
1422 media_template = segment_template.get('media')
1424 ms_info['media_template'] = media_template
1425 initialization = segment_template.get('initialization')
1427 ms_info['initialization_url'] = initialization
1429 initialization = segment_template.find(_add_ns('Initialization'))
1430 if initialization is not None:
1431 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1434 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1436 for period in mpd_doc.findall(_add_ns('Period')):
1437 period_duration = parse_duration(period.get('duration')) or mpd_duration
1438 period_ms_info = extract_multisegment_info(period, {
1442 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1443 if is_drm_protected(adaptation_set):
1445 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1446 for representation in adaptation_set.findall(_add_ns('Representation')):
1447 if is_drm_protected(representation):
1449 representation_attrib = adaptation_set.attrib.copy()
1450 representation_attrib.update(representation.attrib)
1451 mime_type = representation_attrib.get('mimeType')
1452 content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1453 if content_type == 'text':
1454 # TODO implement WebVTT downloading
1456 elif content_type == 'video' or content_type == 'audio':
1458 for element in (representation, adaptation_set, period, mpd_doc):
1459 base_url_e = element.find(_add_ns('BaseURL'))
1460 if base_url_e is not None:
1461 base_url = base_url_e.text + base_url
1462 if re.match(r'^https?://', base_url):
1464 if mpd_base_url and not re.match(r'^https?://', base_url):
1465 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1467 base_url = mpd_base_url + base_url
1468 representation_id = representation_attrib.get('id')
1469 lang = representation_attrib.get('lang')
1470 url_el = representation.find(_add_ns('BaseURL'))
1471 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1473 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1475 'ext': codec2ext(representation_attrib.get('codecs')),
1476 'width': int_or_none(representation_attrib.get('width')),
1477 'height': int_or_none(representation_attrib.get('height')),
1478 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1479 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1480 'fps': int_or_none(representation_attrib.get('frameRate')),
1481 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1482 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1483 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1484 'format_note': 'DASH %s' % content_type,
1485 'filesize': filesize,
1487 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1488 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1489 if 'total_number' not in representation_ms_info and 'segment_duration':
1490 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1491 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1492 media_template = representation_ms_info['media_template']
1493 media_template = media_template.replace('$RepresentationID$', representation_id)
1494 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1495 media_template.replace('$$', '$')
1496 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1497 if 'segment_urls' in representation_ms_info:
1499 'segment_urls': representation_ms_info['segment_urls'],
1500 'protocol': 'http_dash_segments',
1502 if 'initialization_url' in representation_ms_info:
1503 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1505 'initialization_url': initialization_url,
1507 if not f.get('url'):
1508 f['url'] = initialization_url
1510 existing_format = next(
1511 fo for fo in formats
1512 if fo['format_id'] == representation_id)
1513 except StopIteration:
1514 full_info = formats_dict.get(representation_id, {}).copy()
1516 formats.append(full_info)
1518 existing_format.update(f)
1520 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1521 self._sort_formats(formats)
1524 def _live_title(self, name):
1525 """ Generate the title for a live video """
1526 now = datetime.datetime.now()
1527 now_str = now.strftime('%Y-%m-%d %H:%M')
1528 return name + ' ' + now_str
1530 def _int(self, v, name, fatal=False, **kwargs):
1531 res = int_or_none(v, **kwargs)
1532 if 'get_attr' in kwargs:
1533 print(getattr(v, kwargs['get_attr']))
1535 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1537 raise ExtractorError(msg)
1539 self._downloader.report_warning(msg)
1542 def _float(self, v, name, fatal=False, **kwargs):
1543 res = float_or_none(v, **kwargs)
1545 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1547 raise ExtractorError(msg)
1549 self._downloader.report_warning(msg)
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Insert a cookie into the shared downloader cookie jar so it is sent
        # with subsequent requests to `domain`.
        # Positional args follow the http.cookiejar.Cookie constructor order:
        # version=0, name, value, port=None, port_specified=None, domain,
        # domain_specified=None, domain_initial_dot=None, path='/',
        # path_specified=True, secure=False, expires=expire_time, discard='',
        # comment=None, comment_url=None, rest=None.
        # NOTE(review): confirm this mapping against the compat_cookiejar
        # layer — the order is easy to break and not checked at runtime.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1558 def _get_cookies(self, url):
1559 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1560 req = sanitized_Request(url)
1561 self._downloader.cookiejar.add_cookie_header(req)
1562 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1564 def get_testcases(self, include_onlymatching=False):
1565 t = getattr(self, '_TEST', None)
1567 assert not hasattr(self, '_TESTS'), \
1568 '%s has _TEST and _TESTS' % type(self).__name__
1571 tests = getattr(self, '_TESTS', [])
1573 if not include_onlymatching and t.get('only_matching', False):
1575 t['name'] = type(self).__name__[:-len('IE')]
1578 def is_suitable(self, age_limit):
1579 """ Test whether the extractor is generally suitable for the given
1580 age limit (i.e. pornographic sites are not, all others usually are) """
1582 any_restricted = False
1583 for tc in self.get_testcases(include_onlymatching=False):
1584 if 'playlist' in tc:
1585 tc = tc['playlist'][0]
1586 is_restricted = age_restricted(
1587 tc.get('info_dict', {}).get('age_limit'), age_limit)
1588 if not is_restricted:
1590 any_restricted = any_restricted or is_restricted
1591 return not any_restricted
1593 def extract_subtitles(self, *args, **kwargs):
1594 if (self._downloader.params.get('writesubtitles', False) or
1595 self._downloader.params.get('listsubtitles')):
1596 return self._get_subtitles(*args, **kwargs)
1599 def _get_subtitles(self, *args, **kwargs):
1600 raise NotImplementedError('This method must be implemented by subclasses')
1603 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1604 """ Merge subtitle items for one language. Items with duplicated URLs
1605 will be dropped. """
1606 list1_urls = set([item['url'] for item in subtitle_list1])
1607 ret = list(subtitle_list1)
1608 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1612 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1613 """ Merge two subtitle dictionaries, language by language. """
1614 ret = dict(subtitle_dict1)
1615 for lang in subtitle_dict2:
1616 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1619 def extract_automatic_captions(self, *args, **kwargs):
1620 if (self._downloader.params.get('writeautomaticsub', False) or
1621 self._downloader.params.get('listsubtitles')):
1622 return self._get_automatic_captions(*args, **kwargs)
1625 def _get_automatic_captions(self, *args, **kwargs):
1626 raise NotImplementedError('This method must be implemented by subclasses')
1628 def mark_watched(self, *args, **kwargs):
1629 if (self._downloader.params.get('mark_watched', False) and
1630 (self._get_login_info()[0] is not None or
1631 self._downloader.params.get('cookiefile') is not None)):
1632 self._mark_watched(*args, **kwargs)
1634 def _mark_watched(self, *args, **kwargs):
1635 raise NotImplementedError('This method must be implemented by subclasses')
1638 class SearchInfoExtractor(InfoExtractor):
1640 Base class for paged search queries extractors.
1641 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1642 Instances should define _SEARCH_KEY and _MAX_RESULTS.
1646 def _make_valid_url(cls):
1647 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1650 def suitable(cls, url):
1651 return re.match(cls._make_valid_url(), url) is not None
1653 def _real_extract(self, query):
1654 mobj = re.match(self._make_valid_url(), query)
1656 raise ExtractorError('Invalid search query "%s"' % query)
1658 prefix = mobj.group('prefix')
1659 query = mobj.group('query')
1661 return self._get_n_results(query, 1)
1662 elif prefix == 'all':
1663 return self._get_n_results(query, self._MAX_RESULTS)
1667 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1668 elif n > self._MAX_RESULTS:
1669 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1670 n = self._MAX_RESULTS
1671 return self._get_n_results(query, n)
1673 def _get_n_results(self, query, n):
1674 """Get a specified number of results for a query"""
1675 raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        # Public read-only accessor for the extractor's search key prefix.
        # NOTE(review): appears to be a @property (decorator line not visible
        # in this view) — confirm before calling it as a plain method.
        return self._SEARCH_KEY