git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_http_client,
  20     compat_urllib_error,
  21     compat_urllib_parse,
  22     compat_urlparse,
  23     compat_str,
  24     compat_etree_fromstring,
  25 )
  26 from ..utils import (
  27     NO_DEFAULT,
  28     age_restricted,
  29     bug_reports_message,
  30     clean_html,
  31     compiled_regex_type,
  32     determine_ext,
  33     error_to_compat_str,
  34     ExtractorError,
  35     fix_xml_ampersands,
  36     float_or_none,
  37     int_or_none,
  38     parse_iso8601,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     sanitized_Request,
  42     unescapeHTML,
  43     unified_strdate,
  44     url_basename,
  45     xpath_text,
  46     xpath_with_ns,
  47     determine_protocol,
  48     parse_duration,
  49     mimetype2ext,
  50 )
  51
  52
  53 class InfoExtractor(object):
  54     """Information Extractor class.
  55
  56     Information extractors are the classes that, given a URL, extract
  57     information about the video (or videos) the URL refers to. This
  58     information includes the real video URL, the video title, author and
  59     others. The information is stored in a dictionary which is then
  60     passed to the YoutubeDL. The YoutubeDL processes this
  61     information possibly downloading the video to the file system, among
  62     other possible outcomes.
  63
  64     The type field determines the type of the result.
  65     By far the most common value (and the default if _type is missing) is
  66     "video", which indicates a single video.
  67
  68     For a video, the dictionaries must include the following fields:
  69
  70     id:             Video identifier.
  71     title:          Video title, unescaped.
  72
  73     Additionally, it must contain either a formats entry or a url one:
  74
  75     formats:        A list of dictionaries for each format available, ordered
  76                     from worst to best quality.
  77
  78                     Potential fields:
  79                     * url        Mandatory. The URL of the video file
  80                     * ext        Will be calculated from URL if missing
  81                     * format     A human-readable description of the format
  82                                  ("mp4 container with h264/opus").
  83                                  Calculated from the format_id, width, height,
  84                                  and format_note fields if missing.
  85                     * format_id  A short description of the format
  86                                  ("mp4_h264_opus" or "19").
  87                                 Technically optional, but strongly recommended.
  88                     * format_note Additional info about the format
  89                                  ("3D" or "DASH video")
  90                     * width      Width of the video, if known
  91                     * height     Height of the video, if known
  92                     * resolution Textual description of width and height
  93                     * tbr        Average bitrate of audio and video in KBit/s
  94                     * abr        Average audio bitrate in KBit/s
  95                     * acodec     Name of the audio codec in use
  96                     * asr        Audio sampling rate in Hertz
  97                     * vbr        Average video bitrate in KBit/s
  98                     * fps        Frame rate
  99                     * vcodec     Name of the video codec in use
 100                     * container  Name of the container format
 101                     * filesize   The number of bytes, if known in advance
 102                     * filesize_approx  An estimate for the number of bytes
 103                     * player_url SWF Player URL (used for rtmpdump).
 104                     * protocol   The protocol that will be used for the actual
 105                                  download, lower-case.
 106                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 107                                  "m3u8", or "m3u8_native".
 108                     * preference Order number of this format. If this field is
 109                                  present and not None, the formats get sorted
 110                                  by this field, regardless of all other values.
 111                                  -1 for default (order by other properties),
 112                                  -2 or smaller for less than default.
 113                                  < -1000 to hide the format (if there is
 114                                     another one which is strictly better)
 115                     * language   Language code, e.g. "de" or "en-US".
 116                     * language_preference  Is this in the language mentioned in
 117                                  the URL?
 118                                  10 if it's what the URL is about,
 119                                  -1 for default (don't know),
 120                                  -10 otherwise, other values reserved for now.
 121                     * quality    Order number of the video quality of this
 122                                  format, irrespective of the file format.
 123                                  -1 for default (order by other properties),
 124                                  -2 or smaller for less than default.
 125                     * source_preference  Order number for this video source
 126                                   (quality takes higher priority)
 127                                  -1 for default (order by other properties),
 128                                  -2 or smaller for less than default.
 129                     * http_headers  A dictionary of additional HTTP headers
 130                                  to add to the request.
 131                     * stretched_ratio  If given and not 1, indicates that the
 132                                  video's pixels are not square.
 133                                  width : height ratio as float.
 134                     * no_resume  The server does not support resuming the
 135                                  (HTTP or RTMP) download. Boolean.
 136
 137     url:            Final video URL.
 138     ext:            Video filename extension.
 139     format:         The video format, defaults to ext (used for --get-format)
 140     player_url:     SWF Player URL (used for rtmpdump).
 141
 142     The following fields are optional:
 143
 144     alt_title:      A secondary title of the video.
 145     display_id      An alternative identifier for the video, not necessarily
 146                     unique, but available before title. Typically, id is
 147                     something like "4234987", title "Dancing naked mole rats",
 148                     and display_id "dancing-naked-mole-rats"
 149     thumbnails:     A list of dictionaries, with the following entries:
 150                         * "id" (optional, string) - Thumbnail format ID
 151                         * "url"
 152                         * "preference" (optional, int) - quality of the image
 153                         * "width" (optional, int)
 154                         * "height" (optional, int)
 155                         * "resolution" (optional, string "{width}x{height}",
 156                                         deprecated)
 157     thumbnail:      Full URL to a video thumbnail image.
 158     description:    Full video description.
 159     uploader:       Full name of the video uploader.
 160     creator:        The main artist who created the video.
 161     release_date:   The date (YYYYMMDD) when the video was released.
 162     timestamp:      UNIX timestamp of the moment the video became available.
 163     upload_date:    Video upload date (YYYYMMDD).
 164                     If not explicitly set, calculated from timestamp.
 165     uploader_id:    Nickname or id of the video uploader.
 166     location:       Physical location where the video was filmed.
 167     subtitles:      The available subtitles as a dictionary in the format
 168                     {language: subformats}. "subformats" is a list sorted from
 169                     lower to higher preference, each element is a dictionary
 170                     with the "ext" entry and one of:
 171                         * "data": The subtitles file contents
 172                         * "url": A URL pointing to the subtitles file
 173                     "ext" will be calculated from URL if missing
 174     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 175                     automatically generated captions
 176     duration:       Length of the video in seconds, as an integer or float.
 177     view_count:     How many users have watched the video on the platform.
 178     like_count:     Number of positive ratings of the video
 179     dislike_count:  Number of negative ratings of the video
 180     repost_count:   Number of reposts of the video
 181     average_rating: Average rating given by users, the scale used depends on the webpage
 182     comment_count:  Number of comments on the video
 183     comments:       A list of comments, each with one or more of the following
 184                     properties (all but one of text or html optional):
 185                         * "author" - human-readable name of the comment author
 186                         * "author_id" - user ID of the comment author
 187                         * "id" - Comment ID
 188                         * "html" - Comment as HTML
 189                         * "text" - Plain text of the comment
 190                         * "timestamp" - UNIX timestamp of comment
 191                         * "parent" - ID of the comment this one is replying to.
 192                                      Set to "root" to indicate that this is a
 193                                      comment to the original video.
 194     age_limit:      Age restriction for the video, as an integer (years)
 195     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 196                     should allow to get the same result again. (It will be set
 197                     by YoutubeDL if it's missing)
 198     categories:     A list of categories that the video falls in, for example
 199                     ["Sports", "Berlin"]
 200     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 201     is_live:        True, False, or None (=unknown). Whether this video is a
 202                     live stream that goes on instead of a fixed-length video.
 203     start_time:     Time in seconds where the reproduction should start, as
 204                     specified in the URL.
 205     end_time:       Time in seconds where the reproduction should end, as
 206                     specified in the URL.
 207
 208     The following fields should only be used when the video belongs to some logical
 209     chapter or section:
 210
 211     chapter:        Name or title of the chapter the video belongs to.
 212     chapter_number: Number of the chapter the video belongs to, as an integer.
 213     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 214
 215     The following fields should only be used when the video is an episode of some
 216     series or programme:
 217
 218     series:         Title of the series or programme the video episode belongs to.
 219     season:         Title of the season the video episode belongs to.
 220     season_number:  Number of the season the video episode belongs to, as an integer.
 221     season_id:      Id of the season the video episode belongs to, as a unicode string.
 222     episode:        Title of the video episode. Unlike mandatory video title field,
 223                     this field should denote the exact title of the video episode
 224                     without any kind of decoration.
 225     episode_number: Number of the video episode within a season, as an integer.
 226     episode_id:     Id of the video episode, as a unicode string.
 227
 228     Unless mentioned otherwise, the fields should be Unicode strings.
 229
 230     Unless mentioned otherwise, None is equivalent to absence of information.
 231
 232
 233     _type "playlist" indicates multiple videos.
 234     There must be a key "entries", which is a list, an iterable, or a PagedList
 235     object, each element of which is a valid dictionary by this specification.
 236
 237     Additionally, playlists can have "title", "description" and "id" attributes
 238     with the same semantics as videos (see above).
 239
 240
 241     _type "multi_video" indicates that there are multiple videos that
 242     form a single show, for example multiple acts of an opera or TV episode.
 243     It must have an entries key like a playlist and contain all the keys
 244     required for a video at the same time.
 245
 246
 247     _type "url" indicates that the video must be extracted from another
 248     location, possibly by a different extractor. Its only required key is:
 249     "url" - the next URL to extract.
 250     The key "ie_key" can be set to the class name (minus the trailing "IE",
 251     e.g. "Youtube") if the extractor class is known in advance.
 252     Additionally, the dictionary may have any properties of the resolved entity
 253     known in advance, for example "title" if the title of the referred video is
 254     known ahead of time.
 255
 256
 257     _type "url_transparent" entities have the same specification as "url", but
 258     indicate that the given additional information is more precise than the one
 259     associated with the resolved URL.
 260     This is useful when a site employs a video service that hosts the video and
 261     its technical metadata, but that video service does not embed a useful
 262     title, description etc.
 263
 264
 265     Subclasses of this one should re-define the _real_initialize() and
 266     _real_extract() methods and define a _VALID_URL regexp.
 267     Probably, they should also be added to the list of extractors.
 268
 269     Finally, the _WORKING attribute should be set to False for broken IEs
 270     in order to warn the users and skip the tests.
 271     """
 272
    # Whether _real_initialize() has already been run; initialize() sets this
    # to True after the first call.
    _ready = False
    # Downloader object assigned through set_downloader().
    _downloader = None
    # Value returned by working(); set to False for broken IEs.
    _WORKING = True
 276
 277     def __init__(self, downloader=None):
 278         """Constructor. Receives an optional downloader."""
 279         self._ready = False
 280         self.set_downloader(downloader)
 281
 282     @classmethod
 283     def suitable(cls, url):
 284         """Receives a URL and returns True if suitable for this IE."""
 285
 286         # This does not use has/getattr intentionally - we want to know whether
 287         # we have cached the regexp for *this* class, whereas getattr would also
 288         # match the superclass
 289         if '_VALID_URL_RE' not in cls.__dict__:
 290             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 291         return cls._VALID_URL_RE.match(url) is not None
 292
 293     @classmethod
 294     def _match_id(cls, url):
 295         if '_VALID_URL_RE' not in cls.__dict__:
 296             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 297         m = cls._VALID_URL_RE.match(url)
 298         assert m
 299         return m.group('id')
 300
 301     @classmethod
 302     def working(cls):
 303         """Getter method for _WORKING."""
 304         return cls._WORKING
 305
 306     def initialize(self):
 307         """Initializes an instance (authentication, etc)."""
 308         if not self._ready:
 309             self._real_initialize()
 310             self._ready = True
 311
 312     def extract(self, url):
 313         """Extracts URL information and returns it in list of dicts."""
 314         try:
 315             self.initialize()
 316             return self._real_extract(url)
 317         except ExtractorError:
 318             raise
 319         except compat_http_client.IncompleteRead as e:
 320             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 321         except (KeyError, StopIteration) as e:
 322             raise ExtractorError('An extractor error has occurred.', cause=e)
 323
 324     def set_downloader(self, downloader):
 325         """Sets the downloader for this IE."""
 326         self._downloader = downloader
 327
 328     def _real_initialize(self):
 329         """Real initialization process. Redefine in subclasses."""
 330         pass
 331
 332     def _real_extract(self, url):
 333         """Real extraction process. Redefine in subclasses."""
 334         pass
 335
 336     @classmethod
 337     def ie_key(cls):
 338         """A string for getting the InfoExtractor with get_info_extractor"""
 339         return compat_str(cls.__name__[:-2])
 340
 341     @property
 342     def IE_NAME(self):
 343         return compat_str(type(self).__name__[:-2])
 344
 345     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 346         """ Returns the response handle """
 347         if note is None:
 348             self.report_download_webpage(video_id)
 349         elif note is not False:
 350             if video_id is None:
 351                 self.to_screen('%s' % (note,))
 352             else:
 353                 self.to_screen('%s: %s' % (video_id, note))
 354         try:
 355             return self._downloader.urlopen(url_or_request)
 356         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 357             if errnote is False:
 358                 return False
 359             if errnote is None:
 360                 errnote = 'Unable to download webpage'
 361
 362             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 363             if fatal:
 364                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 365             else:
 366                 self._downloader.report_warning(errmsg)
 367                 return False
 368
 369     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 370         """ Returns a tuple (page content as string, URL handle) """
 371         # Strip hashes from the URL (#1038)
 372         if isinstance(url_or_request, (compat_str, str)):
 373             url_or_request = url_or_request.partition('#')[0]
 374
 375         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 376         if urlh is False:
 377             assert not fatal
 378             return False
 379         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 380         return (content, urlh)
 381
 382     @staticmethod
 383     def _guess_encoding_from_content(content_type, webpage_bytes):
 384         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 385         if m:
 386             encoding = m.group(1)
 387         else:
 388             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 389                           webpage_bytes[:1024])
 390             if m:
 391                 encoding = m.group(1).decode('ascii')
 392             elif webpage_bytes.startswith(b'\xff\xfe'):
 393                 encoding = 'utf-16'
 394             else:
 395                 encoding = 'utf-8'
 396
 397         return encoding
 398
 399     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 400         content_type = urlh.headers.get('Content-Type', '')
 401         webpage_bytes = urlh.read()
 402         if prefix is not None:
 403             webpage_bytes = prefix + webpage_bytes
 404         if not encoding:
 405             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 406         if self._downloader.params.get('dump_intermediate_pages', False):
 407             try:
 408                 url = url_or_request.get_full_url()
 409             except AttributeError:
 410                 url = url_or_request
 411             self.to_screen('Dumping request to ' + url)
 412             dump = base64.b64encode(webpage_bytes).decode('ascii')
 413             self._downloader.to_screen(dump)
 414         if self._downloader.params.get('write_pages', False):
 415             try:
 416                 url = url_or_request.get_full_url()
 417             except AttributeError:
 418                 url = url_or_request
 419             basen = '%s_%s' % (video_id, url)
 420             if len(basen) > 240:
 421                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 422                 basen = basen[:240 - len(h)] + h
 423             raw_filename = basen + '.dump'
 424             filename = sanitize_filename(raw_filename, restricted=True)
 425             self.to_screen('Saving request to ' + filename)
 426             # Working around MAX_PATH limitation on Windows (see
 427             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 428             if os.name == 'nt':
 429                 absfilepath = os.path.abspath(filename)
 430                 if len(absfilepath) > 259:
 431                     filename = '\\\\?\\' + absfilepath
 432             with open(filename, 'wb') as outf:
 433                 outf.write(webpage_bytes)
 434
 435         try:
 436             content = webpage_bytes.decode(encoding, 'replace')
 437         except LookupError:
 438             content = webpage_bytes.decode('utf-8', 'replace')
 439
 440         if ('<title>Access to this site is blocked</title>' in content and
 441                 'Websense' in content[:512]):
 442             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 443             blocked_iframe = self._html_search_regex(
 444                 r'<iframe src="([^"]+)"', content,
 445                 'Websense information URL', default=None)
 446             if blocked_iframe:
 447                 msg += ' Visit %s for more details' % blocked_iframe
 448             raise ExtractorError(msg, expected=True)
 449         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 450             msg = (
 451                 'Access to this webpage has been blocked by Indian censorship. '
 452                 'Use a VPN or proxy server (with --proxy) to route around it.')
 453             block_msg = self._html_search_regex(
 454                 r'</h1><p>(.*?)</p>',
 455                 content, 'block message', default=None)
 456             if block_msg:
 457                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 458             raise ExtractorError(msg, expected=True)
 459
 460         return content
 461
 462     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 463         """ Returns the data of the page as a string """
 464         success = False
 465         try_count = 0
 466         while success is False:
 467             try:
 468                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 469                 success = True
 470             except compat_http_client.IncompleteRead as e:
 471                 try_count += 1
 472                 if try_count >= tries:
 473                     raise e
 474                 self._sleep(timeout, video_id)
 475         if res is False:
 476             return res
 477         else:
 478             content, _ = res
 479             return content
 480
 481     def _download_xml(self, url_or_request, video_id,
 482                       note='Downloading XML', errnote='Unable to download XML',
 483                       transform_source=None, fatal=True, encoding=None):
 484         """Return the xml as an xml.etree.ElementTree.Element"""
 485         xml_string = self._download_webpage(
 486             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 487         if xml_string is False:
 488             return xml_string
 489         if transform_source:
 490             xml_string = transform_source(xml_string)
 491         return compat_etree_fromstring(xml_string.encode('utf-8'))
 492
 493     def _download_json(self, url_or_request, video_id,
 494                        note='Downloading JSON metadata',
 495                        errnote='Unable to download JSON metadata',
 496                        transform_source=None,
 497                        fatal=True, encoding=None):
 498         json_string = self._download_webpage(
 499             url_or_request, video_id, note, errnote, fatal=fatal,
 500             encoding=encoding)
 501         if (not fatal) and json_string is False:
 502             return None
 503         return self._parse_json(
 504             json_string, video_id, transform_source=transform_source, fatal=fatal)
 505
 506     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 507         if transform_source:
 508             json_string = transform_source(json_string)
 509         try:
 510             return json.loads(json_string)
 511         except ValueError as ve:
 512             errmsg = '%s: Failed to parse JSON ' % video_id
 513             if fatal:
 514                 raise ExtractorError(errmsg, cause=ve)
 515             else:
 516                 self.report_warning(errmsg + str(ve))
 517
 518     def report_warning(self, msg, video_id=None):
 519         idstr = '' if video_id is None else '%s: ' % video_id
 520         self._downloader.report_warning(
 521             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 522
 523     def to_screen(self, msg):
 524         """Print msg to screen, prefixing it with '[ie_name]'"""
 525         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 526
 527     def report_extraction(self, id_or_name):
 528         """Report information extraction."""
 529         self.to_screen('%s: Extracting information' % id_or_name)
 530
 531     def report_download_webpage(self, video_id):
 532         """Report webpage download."""
 533         self.to_screen('%s: Downloading webpage' % video_id)
 534
 535     def report_age_confirmation(self):
 536         """Report attempt to confirm age."""
 537         self.to_screen('Confirming age')
 538
 539     def report_login(self):
 540         """Report attempt to log in."""
 541         self.to_screen('Logging in')
 542
 543     @staticmethod
 544     def raise_login_required(msg='This video is only available for registered users'):
 545         raise ExtractorError(
 546             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 547             expected=True)
 548
 549     @staticmethod
 550     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 551         raise ExtractorError(
 552             '%s. You might want to use --proxy to workaround.' % msg,
 553             expected=True)
 554
 555     # Methods for following #608
 556     @staticmethod
 557     def url_result(url, ie=None, video_id=None, video_title=None):
 558         """Returns a URL that points to a page that should be processed"""
 559         # TODO: ie should be the class used for getting the info
 560         video_info = {'_type': 'url',
 561                       'url': url,
 562                       'ie_key': ie}
 563         if video_id is not None:
 564             video_info['id'] = video_id
 565         if video_title is not None:
 566             video_info['title'] = video_title
 567         return video_info
 568
 569     @staticmethod
 570     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 571         """Returns a playlist"""
 572         video_info = {'_type': 'playlist',
 573                       'entries': entries}
 574         if playlist_id:
 575             video_info['id'] = playlist_id
 576         if playlist_title:
 577             video_info['title'] = playlist_title
 578         if playlist_description:
 579             video_info['description'] = playlist_description
 580         return video_info
 581
 582     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 583         """
 584         Perform a regex search on the given string, using a single or a list of
 585         patterns returning the first matching group.
 586         In case of failure return a default value or raise a WARNING or a
 587         RegexNotFoundError, depending on fatal, specifying the field name.
 588         """
 589         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 590             mobj = re.search(pattern, string, flags)
 591         else:
 592             for p in pattern:
 593                 mobj = re.search(p, string, flags)
 594                 if mobj:
 595                     break
 596
 597         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 598             _name = '\033[0;34m%s\033[0m' % name
 599         else:
 600             _name = name
 601
 602         if mobj:
 603             if group is None:
 604                 # return the first matching group
 605                 return next(g for g in mobj.groups() if g is not None)
 606             else:
 607                 return mobj.group(group)
 608         elif default is not NO_DEFAULT:
 609             return default
 610         elif fatal:
 611             raise RegexNotFoundError('Unable to extract %s' % _name)
 612         else:
 613             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 614             return None
 615
 616     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 617         """
 618         Like _search_regex, but strips HTML tags and unescapes entities.
 619         """
 620         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 621         if res:
 622             return clean_html(res).strip()
 623         else:
 624             return res
 625
 626     def _get_login_info(self):
 627         """
 628         Get the login info as (username, password)
 629         It will look in the netrc file using the _NETRC_MACHINE value
 630         If there's no info available, return (None, None)
 631         """
 632         if self._downloader is None:
 633             return (None, None)
 634
 635         username = None
 636         password = None
 637         downloader_params = self._downloader.params
 638
 639         # Attempt to use provided username and password or .netrc data
 640         if downloader_params.get('username') is not None:
 641             username = downloader_params['username']
 642             password = downloader_params['password']
 643         elif downloader_params.get('usenetrc', False):
 644             try:
 645                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 646                 if info is not None:
 647                     username = info[0]
 648                     password = info[2]
 649                 else:
 650                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 651             except (IOError, netrc.NetrcParseError) as err:
 652                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 653
 654         return (username, password)
 655
 656     def _get_tfa_info(self, note='two-factor verification code'):
 657         """
 658         Get the two-factor authentication info
 659         TODO - asking the user will be required for sms/phone verify
 660         currently just uses the command line option
 661         If there's no info available, return None
 662         """
 663         if self._downloader is None:
 664             return None
 665         downloader_params = self._downloader.params
 666
 667         if downloader_params.get('twofactor') is not None:
 668             return downloader_params['twofactor']
 669
 670         return compat_getpass('Type %s and press [Return]: ' % note)
 671
 672     # Helper functions for extracting OpenGraph info
 673     @staticmethod
 674     def _og_regexes(prop):
 675         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 676         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 677                        % {'prop': re.escape(prop)})
 678         template = r'<meta[^>]+?%s[^>]+?%s'
 679         return [
 680             template % (property_re, content_re),
 681             template % (content_re, property_re),
 682         ]
 683
 684     @staticmethod
 685     def _meta_regex(prop):
 686         return r'''(?isx)<meta
 687                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 688                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 689
 690     def _og_search_property(self, prop, html, name=None, **kargs):
 691         if name is None:
 692             name = 'OpenGraph %s' % prop
 693         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 694         if escaped is None:
 695             return None
 696         return unescapeHTML(escaped)
 697
 698     def _og_search_thumbnail(self, html, **kargs):
 699         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 700
 701     def _og_search_description(self, html, **kargs):
 702         return self._og_search_property('description', html, fatal=False, **kargs)
 703
 704     def _og_search_title(self, html, **kargs):
 705         return self._og_search_property('title', html, **kargs)
 706
 707     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 708         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 709         if secure:
 710             regexes = self._og_regexes('video:secure_url') + regexes
 711         return self._html_search_regex(regexes, html, name, **kargs)
 712
 713     def _og_search_url(self, html, **kargs):
 714         return self._og_search_property('url', html, **kargs)
 715
 716     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 717         if display_name is None:
 718             display_name = name
 719         return self._html_search_regex(
 720             self._meta_regex(name),
 721             html, display_name, fatal=fatal, group='content', **kwargs)
 722
 723     def _dc_search_uploader(self, html):
 724         return self._html_search_meta('dc.creator', html, 'uploader')
 725
 726     def _rta_search(self, html):
 727         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 728         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 729                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 730                      html):
 731             return 18
 732         return 0
 733
 734     def _media_rating_search(self, html):
 735         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 736         rating = self._html_search_meta('rating', html)
 737
 738         if not rating:
 739             return None
 740
 741         RATING_TABLE = {
 742             'safe for kids': 0,
 743             'general': 8,
 744             '14 years': 14,
 745             'mature': 17,
 746             'restricted': 19,
 747         }
 748         return RATING_TABLE.get(rating.lower())
 749
 750     def _family_friendly_search(self, html):
 751         # See http://schema.org/VideoObject
 752         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 753
 754         if not family_friendly:
 755             return None
 756
 757         RATING_TABLE = {
 758             '1': 0,
 759             'true': 0,
 760             '0': 18,
 761             'false': 18,
 762         }
 763         return RATING_TABLE.get(family_friendly.lower())
 764
 765     def _twitter_search_player(self, html):
 766         return self._html_search_meta('twitter:player', html,
 767                                       'twitter card player')
 768
 769     def _search_json_ld(self, html, video_id, **kwargs):
 770         json_ld = self._search_regex(
 771             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 772             html, 'JSON-LD', group='json_ld', **kwargs)
 773         if not json_ld:
 774             return {}
 775         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 776
 777     def _json_ld(self, json_ld, video_id, fatal=True):
 778         if isinstance(json_ld, compat_str):
 779             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 780         if not json_ld:
 781             return {}
 782         info = {}
 783         if json_ld.get('@context') == 'http://schema.org':
 784             item_type = json_ld.get('@type')
 785             if item_type == 'TVEpisode':
 786                 info.update({
 787                     'episode': unescapeHTML(json_ld.get('name')),
 788                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 789                     'description': unescapeHTML(json_ld.get('description')),
 790                 })
 791                 part_of_season = json_ld.get('partOfSeason')
 792                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 793                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 794                 part_of_series = json_ld.get('partOfSeries')
 795                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 796                     info['series'] = unescapeHTML(part_of_series.get('name'))
 797             elif item_type == 'Article':
 798                 info.update({
 799                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 800                     'title': unescapeHTML(json_ld.get('headline')),
 801                     'description': unescapeHTML(json_ld.get('articleBody')),
 802                 })
 803         return dict((k, v) for k, v in info.items() if v is not None)
 804
 805     @staticmethod
 806     def _hidden_inputs(html):
 807         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 808         hidden_inputs = {}
 809         for input in re.findall(r'(?i)<input([^>]+)>', html):
 810             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 811                 continue
 812             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 813             if not name:
 814                 continue
 815             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 816             if not value:
 817                 continue
 818             hidden_inputs[name.group('value')] = value.group('value')
 819         return hidden_inputs
 820
 821     def _form_hidden_inputs(self, form_id, html):
 822         form = self._search_regex(
 823             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 824             html, '%s form' % form_id, group='form')
 825         return self._hidden_inputs(form)
 826
 827     def _sort_formats(self, formats, field_preference=None):
 828         if not formats:
 829             raise ExtractorError('No video formats found')
 830
 831         for f in formats:
 832             # Automatically determine tbr when missing based on abr and vbr (improves
 833             # formats sorting in some cases)
 834             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 835                 f['tbr'] = f['abr'] + f['vbr']
 836
 837         def _formats_key(f):
 838             # TODO remove the following workaround
 839             from ..utils import determine_ext
 840             if not f.get('ext') and 'url' in f:
 841                 f['ext'] = determine_ext(f['url'])
 842
 843             if isinstance(field_preference, (list, tuple)):
 844                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 845
 846             preference = f.get('preference')
 847             if preference is None:
 848                 preference = 0
 849                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 850                     preference -= 0.5
 851
 852             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 853
 854             if f.get('vcodec') == 'none':  # audio only
 855                 if self._downloader.params.get('prefer_free_formats'):
 856                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 857                 else:
 858                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 859                 ext_preference = 0
 860                 try:
 861                     audio_ext_preference = ORDER.index(f['ext'])
 862                 except ValueError:
 863                     audio_ext_preference = -1
 864             else:
 865                 if self._downloader.params.get('prefer_free_formats'):
 866                     ORDER = ['flv', 'mp4', 'webm']
 867                 else:
 868                     ORDER = ['webm', 'flv', 'mp4']
 869                 try:
 870                     ext_preference = ORDER.index(f['ext'])
 871                 except ValueError:
 872                     ext_preference = -1
 873                 audio_ext_preference = 0
 874
 875             return (
 876                 preference,
 877                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 878                 f.get('quality') if f.get('quality') is not None else -1,
 879                 f.get('tbr') if f.get('tbr') is not None else -1,
 880                 f.get('filesize') if f.get('filesize') is not None else -1,
 881                 f.get('vbr') if f.get('vbr') is not None else -1,
 882                 f.get('height') if f.get('height') is not None else -1,
 883                 f.get('width') if f.get('width') is not None else -1,
 884                 proto_preference,
 885                 ext_preference,
 886                 f.get('abr') if f.get('abr') is not None else -1,
 887                 audio_ext_preference,
 888                 f.get('fps') if f.get('fps') is not None else -1,
 889                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 890                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 891                 f.get('format_id') if f.get('format_id') is not None else '',
 892             )
 893         formats.sort(key=_formats_key)
 894
 895     def _check_formats(self, formats, video_id):
 896         if formats:
 897             formats[:] = filter(
 898                 lambda f: self._is_valid_url(
 899                     f['url'], video_id,
 900                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 901                 formats)
 902
 903     @staticmethod
 904     def _remove_duplicate_formats(formats):
 905         format_urls = set()
 906         unique_formats = []
 907         for f in formats:
 908             if f['url'] not in format_urls:
 909                 format_urls.add(f['url'])
 910                 unique_formats.append(f)
 911         formats[:] = unique_formats
 912
 913     def _is_valid_url(self, url, video_id, item='video'):
 914         url = self._proto_relative_url(url, scheme='http:')
 915         # For now assume non HTTP(S) URLs always valid
 916         if not (url.startswith('http://') or url.startswith('https://')):
 917             return True
 918         try:
 919             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 920             return True
 921         except ExtractorError as e:
 922             if isinstance(e.cause, compat_urllib_error.URLError):
 923                 self.to_screen(
 924                     '%s: %s URL is invalid, skipping' % (video_id, item))
 925                 return False
 926             raise
 927
 928     def http_scheme(self):
 929         """ Either "http:" or "https:", depending on the user's preferences """
 930         return (
 931             'http:'
 932             if self._downloader.params.get('prefer_insecure', False)
 933             else 'https:')
 934
 935     def _proto_relative_url(self, url, scheme=None):
 936         if url is None:
 937             return url
 938         if url.startswith('//'):
 939             if scheme is None:
 940                 scheme = self.http_scheme()
 941             return scheme + url
 942         else:
 943             return url
 944
 945     def _sleep(self, timeout, video_id, msg_template=None):
 946         if msg_template is None:
 947             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 948         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 949         self.to_screen(msg)
 950         time.sleep(timeout)
 951
 952     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 953                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 954                              fatal=True):
 955         manifest = self._download_xml(
 956             manifest_url, video_id, 'Downloading f4m manifest',
 957             'Unable to download f4m manifest',
 958             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 959             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 960             transform_source=transform_source,
 961             fatal=fatal)
 962
 963         if manifest is False:
 964             return []
 965
 966         formats = []
 967         manifest_version = '1.0'
 968         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 969         if not media_nodes:
 970             manifest_version = '2.0'
 971             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 972         base_url = xpath_text(
 973             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 974             'base URL', default=None)
 975         if base_url:
 976             base_url = base_url.strip()
 977         for i, media_el in enumerate(media_nodes):
 978             if manifest_version == '2.0':
 979                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 980                 if not media_url:
 981                     continue
 982                 manifest_url = (
 983                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 984                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 985                 # If media_url is itself a f4m manifest do the recursive extraction
 986                 # since bitrates in parent manifest (this one) and media_url manifest
 987                 # may differ leading to inability to resolve the format by requested
 988                 # bitrate in f4m downloader
 989                 if determine_ext(manifest_url) == 'f4m':
 990                     formats.extend(self._extract_f4m_formats(
 991                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
 992                     continue
 993             tbr = int_or_none(media_el.attrib.get('bitrate'))
 994             formats.append({
 995                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 996                 'url': manifest_url,
 997                 'ext': 'flv',
 998                 'tbr': tbr,
 999                 'width': int_or_none(media_el.attrib.get('width')),
1000                 'height': int_or_none(media_el.attrib.get('height')),
1001                 'preference': preference,
1002             })
1003         self._sort_formats(formats)
1004
1005         return formats
1006
1007     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1008                               entry_protocol='m3u8', preference=None,
1009                               m3u8_id=None, note=None, errnote=None,
1010                               fatal=True):
1011
1012         formats = [{
1013             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1014             'url': m3u8_url,
1015             'ext': ext,
1016             'protocol': 'm3u8',
1017             'preference': preference - 1 if preference else -1,
1018             'resolution': 'multiple',
1019             'format_note': 'Quality selection URL',
1020         }]
1021
1022         format_url = lambda u: (
1023             u
1024             if re.match(r'^https?://', u)
1025             else compat_urlparse.urljoin(m3u8_url, u))
1026
1027         res = self._download_webpage_handle(
1028             m3u8_url, video_id,
1029             note=note or 'Downloading m3u8 information',
1030             errnote=errnote or 'Failed to download m3u8 information',
1031             fatal=fatal)
1032         if res is False:
1033             return []
1034         m3u8_doc, urlh = res
1035         m3u8_url = urlh.geturl()
1036
1037         # We should try extracting formats only from master playlists [1], i.e.
1038         # playlists that describe available qualities. On the other hand media
1039         # playlists [2] should be returned as is since they contain just the media
1040         # without qualities renditions.
1041         # Fortunately, master playlist can be easily distinguished from media
1042         # playlist based on particular tags availability. As of [1, 2] master
1043         # playlist tags MUST NOT appear in a media playist and vice versa.
1044         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1045         # and MUST NOT appear in master playlist thus we can clearly detect media
1046         # playlist with this criterion.
1047         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1048         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1049         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1050         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1051             return [{
1052                 'url': m3u8_url,
1053                 'format_id': m3u8_id,
1054                 'ext': ext,
1055                 'protocol': entry_protocol,
1056                 'preference': preference,
1057             }]
1058         last_info = None
1059         last_media = None
1060         kv_rex = re.compile(
1061             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1062         for line in m3u8_doc.splitlines():
1063             if line.startswith('#EXT-X-STREAM-INF:'):
1064                 last_info = {}
1065                 for m in kv_rex.finditer(line):
1066                     v = m.group('val')
1067                     if v.startswith('"'):
1068                         v = v[1:-1]
1069                     last_info[m.group('key')] = v
1070             elif line.startswith('#EXT-X-MEDIA:'):
1071                 last_media = {}
1072                 for m in kv_rex.finditer(line):
1073                     v = m.group('val')
1074                     if v.startswith('"'):
1075                         v = v[1:-1]
1076                     last_media[m.group('key')] = v
1077             elif line.startswith('#') or not line.strip():
1078                 continue
1079             else:
1080                 if last_info is None:
1081                     formats.append({'url': format_url(line)})
1082                     continue
1083                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1084                 format_id = []
1085                 if m3u8_id:
1086                     format_id.append(m3u8_id)
1087                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1088                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1089                 f = {
1090                     'format_id': '-'.join(format_id),
1091                     'url': format_url(line.strip()),
1092                     'tbr': tbr,
1093                     'ext': ext,
1094                     'protocol': entry_protocol,
1095                     'preference': preference,
1096                 }
1097                 resolution = last_info.get('RESOLUTION')
1098                 if resolution:
1099                     width_str, height_str = resolution.split('x')
1100                     f['width'] = int(width_str)
1101                     f['height'] = int(height_str)
1102                 codecs = last_info.get('CODECS')
1103                 if codecs:
1104                     vcodec, acodec = [None] * 2
1105                     va_codecs = codecs.split(',')
1106                     if len(va_codecs) == 1:
1107                         # Audio only entries usually come with single codec and
1108                         # no resolution. For more robustness we also check it to
1109                         # be mp4 audio.
1110                         if not resolution and va_codecs[0].startswith('mp4a'):
1111                             vcodec, acodec = 'none', va_codecs[0]
1112                         else:
1113                             vcodec = va_codecs[0]
1114                     else:
1115                         vcodec, acodec = va_codecs[:2]
1116                     f.update({
1117                         'acodec': acodec,
1118                         'vcodec': vcodec,
1119                     })
1120                 if last_media is not None:
1121                     f['m3u8_media'] = last_media
1122                     last_media = None
1123                 formats.append(f)
1124                 last_info = {}
1125         self._sort_formats(formats)
1126         return formats
1127
1128     @staticmethod
1129     def _xpath_ns(path, namespace=None):
1130         if not namespace:
1131             return path
1132         out = []
1133         for c in path.split('/'):
1134             if not c or c == '.':
1135                 out.append(c)
1136             else:
1137                 out.append('{%s}%s' % (namespace, c))
1138         return '/'.join(out)
1139
1140     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1141         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1142
1143         if smil is False:
1144             assert not fatal
1145             return []
1146
1147         namespace = self._parse_smil_namespace(smil)
1148
1149         return self._parse_smil_formats(
1150             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1151
1152     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1153         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1154         if smil is False:
1155             return {}
1156         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1157
1158     def _download_smil(self, smil_url, video_id, fatal=True):
1159         return self._download_xml(
1160             smil_url, video_id, 'Downloading SMIL file',
1161             'Unable to download SMIL file', fatal=fatal)
1162
1163     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1164         namespace = self._parse_smil_namespace(smil)
1165
1166         formats = self._parse_smil_formats(
1167             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1168         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1169
1170         video_id = os.path.splitext(url_basename(smil_url))[0]
1171         title = None
1172         description = None
1173         upload_date = None
1174         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1175             name = meta.attrib.get('name')
1176             content = meta.attrib.get('content')
1177             if not name or not content:
1178                 continue
1179             if not title and name == 'title':
1180                 title = content
1181             elif not description and name in ('description', 'abstract'):
1182                 description = content
1183             elif not upload_date and name == 'date':
1184                 upload_date = unified_strdate(content)
1185
1186         thumbnails = [{
1187             'id': image.get('type'),
1188             'url': image.get('src'),
1189             'width': int_or_none(image.get('width')),
1190             'height': int_or_none(image.get('height')),
1191         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1192
1193         return {
1194             'id': video_id,
1195             'title': title or video_id,
1196             'description': description,
1197             'upload_date': upload_date,
1198             'thumbnails': thumbnails,
1199             'formats': formats,
1200             'subtitles': subtitles,
1201         }
1202
1203     def _parse_smil_namespace(self, smil):
1204         return self._search_regex(
1205             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1206
1207     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1208         base = smil_url
1209         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1210             b = meta.get('base') or meta.get('httpBase')
1211             if b:
1212                 base = b
1213                 break
1214
1215         formats = []
1216         rtmp_count = 0
1217         http_count = 0
1218         m3u8_count = 0
1219
1220         srcs = []
1221         videos = smil.findall(self._xpath_ns('.//video', namespace))
1222         for video in videos:
1223             src = video.get('src')
1224             if not src or src in srcs:
1225                 continue
1226             srcs.append(src)
1227
1228             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1229             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1230             width = int_or_none(video.get('width'))
1231             height = int_or_none(video.get('height'))
1232             proto = video.get('proto')
1233             ext = video.get('ext')
1234             src_ext = determine_ext(src)
1235             streamer = video.get('streamer') or base
1236
1237             if proto == 'rtmp' or streamer.startswith('rtmp'):
1238                 rtmp_count += 1
1239                 formats.append({
1240                     'url': streamer,
1241                     'play_path': src,
1242                     'ext': 'flv',
1243                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1244                     'tbr': bitrate,
1245                     'filesize': filesize,
1246                     'width': width,
1247                     'height': height,
1248                 })
1249                 if transform_rtmp_url:
1250                     streamer, src = transform_rtmp_url(streamer, src)
1251                     formats[-1].update({
1252                         'url': streamer,
1253                         'play_path': src,
1254                     })
1255                 continue
1256
1257             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1258             src_url = src_url.strip()
1259
1260             if proto == 'm3u8' or src_ext == 'm3u8':
1261                 m3u8_formats = self._extract_m3u8_formats(
1262                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1263                 if len(m3u8_formats) == 1:
1264                     m3u8_count += 1
1265                     m3u8_formats[0].update({
1266                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1267                         'tbr': bitrate,
1268                         'width': width,
1269                         'height': height,
1270                     })
1271                 formats.extend(m3u8_formats)
1272                 continue
1273
1274             if src_ext == 'f4m':
1275                 f4m_url = src_url
1276                 if not f4m_params:
1277                     f4m_params = {
1278                         'hdcore': '3.2.0',
1279                         'plugin': 'flowplayer-3.2.0.1',
1280                     }
1281                 f4m_url += '&' if '?' in f4m_url else '?'
1282                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1283                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1284                 continue
1285
1286             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1287                 http_count += 1
1288                 formats.append({
1289                     'url': src_url,
1290                     'ext': ext or src_ext or 'flv',
1291                     'format_id': 'http-%d' % (bitrate or http_count),
1292                     'tbr': bitrate,
1293                     'filesize': filesize,
1294                     'width': width,
1295                     'height': height,
1296                 })
1297                 continue
1298
1299         self._sort_formats(formats)
1300
1301         return formats
1302
1303     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1304         urls = []
1305         subtitles = {}
1306         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1307             src = textstream.get('src')
1308             if not src or src in urls:
1309                 continue
1310             urls.append(src)
1311             ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1312             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1313             subtitles.setdefault(lang, []).append({
1314                 'url': src,
1315                 'ext': ext,
1316             })
1317         return subtitles
1318
1319     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1320         xspf = self._download_xml(
1321             playlist_url, playlist_id, 'Downloading xpsf playlist',
1322             'Unable to download xspf manifest', fatal=fatal)
1323         if xspf is False:
1324             return []
1325         return self._parse_xspf(xspf, playlist_id)
1326
1327     def _parse_xspf(self, playlist, playlist_id):
1328         NS_MAP = {
1329             'xspf': 'http://xspf.org/ns/0/',
1330             's1': 'http://static.streamone.nl/player/ns/0',
1331         }
1332
1333         entries = []
1334         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1335             title = xpath_text(
1336                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1337             description = xpath_text(
1338                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1339             thumbnail = xpath_text(
1340                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1341             duration = float_or_none(
1342                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1343
1344             formats = [{
1345                 'url': location.text,
1346                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1347                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1348                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1349             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1350             self._sort_formats(formats)
1351
1352             entries.append({
1353                 'id': playlist_id,
1354                 'title': title,
1355                 'description': description,
1356                 'thumbnail': thumbnail,
1357                 'duration': duration,
1358                 'formats': formats,
1359             })
1360         return entries
1361
1362     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1363         res = self._download_webpage_handle(
1364             mpd_url, video_id,
1365             note=note or 'Downloading MPD manifest',
1366             errnote=errnote or 'Failed to download MPD manifest',
1367             fatal=fatal)
1368         if res is False:
1369             return []
1370         mpd, urlh = res
1371         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1372
1373         return self._parse_mpd_formats(
1374             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1375
1376     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1377         if mpd_doc.get('type') == 'dynamic':
1378             return []
1379
1380         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1381
1382         def _add_ns(path):
1383             return self._xpath_ns(path, namespace)
1384
1385         def is_drm_protected(element):
1386             return element.find(_add_ns('ContentProtection')) is not None
1387
1388         def extract_multisegment_info(element, ms_parent_info):
1389             ms_info = ms_parent_info.copy()
1390             segment_list = element.find(_add_ns('SegmentList'))
1391             if segment_list is not None:
1392                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1393                 if segment_urls_e:
1394                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1395                 initialization = segment_list.find(_add_ns('Initialization'))
1396                 if initialization is not None:
1397                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1398             else:
1399                 segment_template = element.find(_add_ns('SegmentTemplate'))
1400                 if segment_template is not None:
1401                     start_number = segment_template.get('startNumber')
1402                     if start_number:
1403                         ms_info['start_number'] = int(start_number)
1404                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1405                     if segment_timeline is not None:
1406                         s_e = segment_timeline.findall(_add_ns('S'))
1407                         if s_e:
1408                             ms_info['total_number'] = 0
1409                             for s in s_e:
1410                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1411                     else:
1412                         timescale = segment_template.get('timescale')
1413                         if timescale:
1414                             ms_info['timescale'] = int(timescale)
1415                         segment_duration = segment_template.get('duration')
1416                         if segment_duration:
1417                             ms_info['segment_duration'] = int(segment_duration)
1418                     media_template = segment_template.get('media')
1419                     if media_template:
1420                         ms_info['media_template'] = media_template
1421                     initialization = segment_template.get('initialization')
1422                     if initialization:
1423                         ms_info['initialization_url'] = initialization
1424                     else:
1425                         initialization = segment_template.find(_add_ns('Initialization'))
1426                         if initialization is not None:
1427                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1428             return ms_info
1429
1430         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1431         formats = []
1432         for period in mpd_doc.findall(_add_ns('Period')):
1433             period_duration = parse_duration(period.get('duration')) or mpd_duration
1434             period_ms_info = extract_multisegment_info(period, {
1435                 'start_number': 1,
1436                 'timescale': 1,
1437             })
1438             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1439                 if is_drm_protected(adaptation_set):
1440                     continue
1441                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1442                 for representation in adaptation_set.findall(_add_ns('Representation')):
1443                     if is_drm_protected(representation):
1444                         continue
1445                     representation_attrib = adaptation_set.attrib.copy()
1446                     representation_attrib.update(representation.attrib)
1447                     mime_type = representation_attrib.get('mimeType')
1448                     content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1449                     if content_type == 'text':
1450                         # TODO implement WebVTT downloading
1451                         pass
1452                     elif content_type == 'video' or content_type == 'audio':
1453                         base_url = ''
1454                         for element in (representation, adaptation_set, period, mpd_doc):
1455                             base_url_e = element.find(_add_ns('BaseURL'))
1456                             if base_url_e is not None:
1457                                 base_url = base_url_e.text + base_url
1458                                 if re.match(r'^https?://', base_url):
1459                                     break
1460                         if mpd_base_url and not re.match(r'^https?://', base_url):
1461                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1462                                 mpd_base_url += '/'
1463                             base_url = mpd_base_url + base_url
1464                         representation_id = representation_attrib.get('id')
1465                         lang = representation_attrib.get('lang')
1466                         url_el = representation.find(_add_ns('BaseURL'))
1467                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1468                         f = {
1469                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1470                             'url': base_url,
1471                             'width': int_or_none(representation_attrib.get('width')),
1472                             'height': int_or_none(representation_attrib.get('height')),
1473                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1474                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1475                             'fps': int_or_none(representation_attrib.get('frameRate')),
1476                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1477                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1478                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1479                             'format_note': 'DASH %s' % content_type,
1480                             'filesize': filesize,
1481                         }
1482                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1483                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1484                             if 'total_number' not in representation_ms_info and 'segment_duration':
1485                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1486                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1487                             media_template = representation_ms_info['media_template']
1488                             media_template = media_template.replace('$RepresentationID$', representation_id)
1489                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1490                             media_template.replace('$$', '$')
1491                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1492                         if 'segment_urls' in representation_ms_info:
1493                             f.update({
1494                                 'segment_urls': representation_ms_info['segment_urls'],
1495                                 'protocol': 'http_dash_segments',
1496                             })
1497                             if 'initialization_url' in representation_ms_info:
1498                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1499                                 f.update({
1500                                     'initialization_url': initialization_url,
1501                                 })
1502                                 if not f.get('url'):
1503                                     f['url'] = initialization_url
1504                         try:
1505                             existing_format = next(
1506                                 fo for fo in formats
1507                                 if fo['format_id'] == representation_id)
1508                         except StopIteration:
1509                             full_info = formats_dict.get(representation_id, {}).copy()
1510                             full_info.update(f)
1511                             formats.append(full_info)
1512                         else:
1513                             existing_format.update(f)
1514                     else:
1515                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1516         self._sort_formats(formats)
1517         return formats
1518
1519     def _live_title(self, name):
1520         """ Generate the title for a live video """
1521         now = datetime.datetime.now()
1522         now_str = now.strftime('%Y-%m-%d %H:%M')
1523         return name + ' ' + now_str
1524
1525     def _int(self, v, name, fatal=False, **kwargs):
1526         res = int_or_none(v, **kwargs)
1527         if 'get_attr' in kwargs:
1528             print(getattr(v, kwargs['get_attr']))
1529         if res is None:
1530             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1531             if fatal:
1532                 raise ExtractorError(msg)
1533             else:
1534                 self._downloader.report_warning(msg)
1535         return res
1536
1537     def _float(self, v, name, fatal=False, **kwargs):
1538         res = float_or_none(v, **kwargs)
1539         if res is None:
1540             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1541             if fatal:
1542                 raise ExtractorError(msg)
1543             else:
1544                 self._downloader.report_warning(msg)
1545         return res
1546
1547     def _set_cookie(self, domain, name, value, expire_time=None):
1548         cookie = compat_cookiejar.Cookie(
1549             0, name, value, None, None, domain, None,
1550             None, '/', True, False, expire_time, '', None, None, None)
1551         self._downloader.cookiejar.set_cookie(cookie)
1552
1553     def _get_cookies(self, url):
1554         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1555         req = sanitized_Request(url)
1556         self._downloader.cookiejar.add_cookie_header(req)
1557         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1558
1559     def get_testcases(self, include_onlymatching=False):
1560         t = getattr(self, '_TEST', None)
1561         if t:
1562             assert not hasattr(self, '_TESTS'), \
1563                 '%s has _TEST and _TESTS' % type(self).__name__
1564             tests = [t]
1565         else:
1566             tests = getattr(self, '_TESTS', [])
1567         for t in tests:
1568             if not include_onlymatching and t.get('only_matching', False):
1569                 continue
1570             t['name'] = type(self).__name__[:-len('IE')]
1571             yield t
1572
1573     def is_suitable(self, age_limit):
1574         """ Test whether the extractor is generally suitable for the given
1575         age limit (i.e. pornographic sites are not, all others usually are) """
1576
1577         any_restricted = False
1578         for tc in self.get_testcases(include_onlymatching=False):
1579             if 'playlist' in tc:
1580                 tc = tc['playlist'][0]
1581             is_restricted = age_restricted(
1582                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1583             if not is_restricted:
1584                 return True
1585             any_restricted = any_restricted or is_restricted
1586         return not any_restricted
1587
1588     def extract_subtitles(self, *args, **kwargs):
1589         if (self._downloader.params.get('writesubtitles', False) or
1590                 self._downloader.params.get('listsubtitles')):
1591             return self._get_subtitles(*args, **kwargs)
1592         return {}
1593
    def _get_subtitles(self, *args, **kwargs):
        """Subtitle lookup hook called by extract_subtitles().

        This base implementation only raises NotImplementedError; extractors
        that provide subtitles have to override it.
        """
        raise NotImplementedError('This method must be implemented by subclasses')
1596
1597     @staticmethod
1598     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1599         """ Merge subtitle items for one language. Items with duplicated URLs
1600         will be dropped. """
1601         list1_urls = set([item['url'] for item in subtitle_list1])
1602         ret = list(subtitle_list1)
1603         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1604         return ret
1605
1606     @classmethod
1607     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1608         """ Merge two subtitle dictionaries, language by language. """
1609         ret = dict(subtitle_dict1)
1610         for lang in subtitle_dict2:
1611             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1612         return ret
1613
1614     def extract_automatic_captions(self, *args, **kwargs):
1615         if (self._downloader.params.get('writeautomaticsub', False) or
1616                 self._downloader.params.get('listsubtitles')):
1617             return self._get_automatic_captions(*args, **kwargs)
1618         return {}
1619
    def _get_automatic_captions(self, *args, **kwargs):
        """Automatic caption lookup hook called by extract_automatic_captions().

        This base implementation only raises NotImplementedError; extractors
        that provide automatic captions have to override it.
        """
        raise NotImplementedError('This method must be implemented by subclasses')
1622
1623
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors of paged search queries.
    The handled pseudo-URLs look like _SEARCH_KEY(|all|N):{query}, where N is
    a positive number of wanted results.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching the search pseudo-URLs of this class"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search pseudo-URL of this class"""
        pattern = cls._make_valid_url()
        return re.match(pattern, url) is not None

    def _real_extract(self, query):
        """Split the pseudo-URL into result count and search terms and
        delegate to _get_n_results.

        An empty count asks for a single result and 'all' for _MAX_RESULTS
        results; a number above _MAX_RESULTS is reduced to it with a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        count_spec, query = match.group('prefix', 'query')
        if count_spec == '':
            return self._get_n_results(query, 1)
        if count_spec == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(count_spec)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY