_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urllib_request,
  26     compat_urlparse,
  27 )
  28 from ..downloader.f4m import remove_encrypted_media
  29 from ..utils import (
  30     NO_DEFAULT,
  31     age_restricted,
  32     bug_reports_message,
  33     clean_html,
  34     compiled_regex_type,
  35     determine_ext,
  36     error_to_compat_str,
  37     ExtractorError,
  38     fix_xml_ampersands,
  39     float_or_none,
  40     int_or_none,
  41     parse_iso8601,
  42     RegexNotFoundError,
  43     sanitize_filename,
  44     sanitized_Request,
  45     unescapeHTML,
  46     unified_strdate,
  47     url_basename,
  48     xpath_text,
  49     xpath_with_ns,
  50     determine_protocol,
  51     parse_duration,
  52     mimetype2ext,
  53     update_Request,
  54     update_url_query,
  55 )
  56
  57
  58 class InfoExtractor(object):
  59     """Information Extractor class.
  60
  61     Information extractors are the classes that, given a URL, extract
  62     information about the video (or videos) the URL refers to. This
  63     information includes the real video URL, the video title, author and
  64     others. The information is stored in a dictionary which is then
  65     passed to the YoutubeDL. The YoutubeDL processes this
  66     information possibly downloading the video to the file system, among
  67     other possible outcomes.
  68
  69     The type field determines the type of the result.
  70     By far the most common value (and the default if _type is missing) is
  71     "video", which indicates a single video.
  72
  73     For a video, the dictionaries must include the following fields:
  74
  75     id:             Video identifier.
  76     title:          Video title, unescaped.
  77
  78     Additionally, it must contain either a formats entry or a url one:
  79
  80     formats:        A list of dictionaries for each format available, ordered
  81                     from worst to best quality.
  82
  83                     Potential fields:
  84                     * url        Mandatory. The URL of the video file
  85                     * ext        Will be calculated from URL if missing
  86                     * format     A human-readable description of the format
  87                                  ("mp4 container with h264/opus").
  88                                  Calculated from the format_id, width, height.
  89                                  and format_note fields if missing.
  90                     * format_id  A short description of the format
  91                                  ("mp4_h264_opus" or "19").
  92                                 Technically optional, but strongly recommended.
  93                     * format_note Additional info about the format
  94                                  ("3D" or "DASH video")
  95                     * width      Width of the video, if known
  96                     * height     Height of the video, if known
  97                     * resolution Textual description of width and height
  98                     * tbr        Average bitrate of audio and video in KBit/s
  99                     * abr        Average audio bitrate in KBit/s
 100                     * acodec     Name of the audio codec in use
 101                     * asr        Audio sampling rate in Hertz
 102                     * vbr        Average video bitrate in KBit/s
 103                     * fps        Frame rate
 104                     * vcodec     Name of the video codec in use
 105                     * container  Name of the container format
 106                     * filesize   The number of bytes, if known in advance
 107                     * filesize_approx  An estimate for the number of bytes
 108                     * player_url SWF Player URL (used for rtmpdump).
 109                     * protocol   The protocol that will be used for the actual
 110                                  download, lower-case.
 111                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 112                                  "m3u8", "m3u8_native" or "http_dash_segments".
 113                     * preference Order number of this format. If this field is
 114                                  present and not None, the formats get sorted
 115                                  by this field, regardless of all other values.
 116                                  -1 for default (order by other properties),
 117                                  -2 or smaller for less than default.
 118                                  < -1000 to hide the format (if there is
 119                                     another one which is strictly better)
 120                     * language   Language code, e.g. "de" or "en-US".
 121                     * language_preference  Is this in the language mentioned in
 122                                  the URL?
 123                                  10 if it's what the URL is about,
 124                                  -1 for default (don't know),
 125                                  -10 otherwise, other values reserved for now.
 126                     * quality    Order number of the video quality of this
 127                                  format, irrespective of the file format.
 128                                  -1 for default (order by other properties),
 129                                  -2 or smaller for less than default.
 130                     * source_preference  Order number for this video source
 131                                   (quality takes higher priority)
 132                                  -1 for default (order by other properties),
 133                                  -2 or smaller for less than default.
 134                     * http_headers  A dictionary of additional HTTP headers
 135                                  to add to the request.
 136                     * stretched_ratio  If given and not 1, indicates that the
 137                                  video's pixels are not square.
 138                                  width : height ratio as float.
 139                     * no_resume  The server does not support resuming the
 140                                  (HTTP or RTMP) download. Boolean.
 141
 142     url:            Final video URL.
 143     ext:            Video filename extension.
 144     format:         The video format, defaults to ext (used for --get-format)
 145     player_url:     SWF Player URL (used for rtmpdump).
 146
 147     The following fields are optional:
 148
 149     alt_title:      A secondary title of the video.
 150     display_id      An alternative identifier for the video, not necessarily
 151                     unique, but available before title. Typically, id is
 152                     something like "4234987", title "Dancing naked mole rats",
 153                     and display_id "dancing-naked-mole-rats"
 154     thumbnails:     A list of dictionaries, with the following entries:
 155                         * "id" (optional, string) - Thumbnail format ID
 156                         * "url"
 157                         * "preference" (optional, int) - quality of the image
 158                         * "width" (optional, int)
 159                         * "height" (optional, int)
 160                         * "resolution" (optional, string "{width}x{height}",
 161                                         deprecated)
 162     thumbnail:      Full URL to a video thumbnail image.
 163     description:    Full video description.
 164     uploader:       Full name of the video uploader.
 165     license:        License name the video is licensed under.
 166     creator:        The main artist who created the video.
 167     release_date:   The date (YYYYMMDD) when the video was released.
 168     timestamp:      UNIX timestamp of the moment the video became available.
 169     upload_date:    Video upload date (YYYYMMDD).
 170                     If not explicitly set, calculated from timestamp.
 171     uploader_id:    Nickname or id of the video uploader.
 172     uploader_url:   Full URL to a personal webpage of the video uploader.
 173     location:       Physical location where the video was filmed.
 174     subtitles:      The available subtitles as a dictionary in the format
 175                     {language: subformats}. "subformats" is a list sorted from
 176                     lower to higher preference, each element is a dictionary
 177                     with the "ext" entry and one of:
 178                         * "data": The subtitles file contents
 179                         * "url": A URL pointing to the subtitles file
 180                     "ext" will be calculated from URL if missing
 181     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 182                     automatically generated captions
 183     duration:       Length of the video in seconds, as an integer or float.
 184     view_count:     How many users have watched the video on the platform.
 185     like_count:     Number of positive ratings of the video
 186     dislike_count:  Number of negative ratings of the video
 187     repost_count:   Number of reposts of the video
 188     average_rating: Average rating given by users, the scale used depends on the webpage
 189     comment_count:  Number of comments on the video
 190     comments:       A list of comments, each with one or more of the following
 191                     properties (all but one of text or html optional):
 192                         * "author" - human-readable name of the comment author
 193                         * "author_id" - user ID of the comment author
 194                         * "id" - Comment ID
 195                         * "html" - Comment as HTML
 196                         * "text" - Plain text of the comment
 197                         * "timestamp" - UNIX timestamp of comment
 198                         * "parent" - ID of the comment this one is replying to.
 199                                      Set to "root" to indicate that this is a
 200                                      comment to the original video.
 201     age_limit:      Age restriction for the video, as an integer (years)
 202     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 203                     should allow to get the same result again. (It will be set
 204                     by YoutubeDL if it's missing)
 205     categories:     A list of categories that the video falls in, for example
 206                     ["Sports", "Berlin"]
 207     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 208     is_live:        True, False, or None (=unknown). Whether this video is a
 209                     live stream that goes on instead of a fixed-length video.
 210     start_time:     Time in seconds where the reproduction should start, as
 211                     specified in the URL.
 212     end_time:       Time in seconds where the reproduction should end, as
 213                     specified in the URL.
 214
 215     The following fields should only be used when the video belongs to some logical
 216     chapter or section:
 217
 218     chapter:        Name or title of the chapter the video belongs to.
 219     chapter_number: Number of the chapter the video belongs to, as an integer.
 220     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 221
 222     The following fields should only be used when the video is an episode of some
 223     series or programme:
 224
 225     series:         Title of the series or programme the video episode belongs to.
 226     season:         Title of the season the video episode belongs to.
 227     season_number:  Number of the season the video episode belongs to, as an integer.
 228     season_id:      Id of the season the video episode belongs to, as a unicode string.
 229     episode:        Title of the video episode. Unlike mandatory video title field,
 230                     this field should denote the exact title of the video episode
 231                     without any kind of decoration.
 232     episode_number: Number of the video episode within a season, as an integer.
 233     episode_id:     Id of the video episode, as a unicode string.
 234
 235     The following fields should only be used when the media is a track or a part of
 236     a music album:
 237
 238     track:          Title of the track.
 239     track_number:   Number of the track within an album or a disc, as an integer.
 240     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 241                     as a unicode string.
 242     artist:         Artist(s) of the track.
 243     genre:          Genre(s) of the track.
 244     album:          Title of the album the track belongs to.
 245     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 246     album_artist:   List of all artists appeared on the album (e.g.
 247                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 248                     and compilations).
 249     disc_number:    Number of the disc or other physical medium the track belongs to,
 250                     as an integer.
 251     release_year:   Year (YYYY) when the album was released.
 252
 253     Unless mentioned otherwise, the fields should be Unicode strings.
 254
 255     Unless mentioned otherwise, None is equivalent to absence of information.
 256
 257
 258     _type "playlist" indicates multiple videos.
 259     There must be a key "entries", which is a list, an iterable, or a PagedList
 260     object, each element of which is a valid dictionary by this specification.
 261
 262     Additionally, playlists can have "title", "description" and "id" attributes
 263     with the same semantics as videos (see above).
 264
 265
 266     _type "multi_video" indicates that there are multiple videos that
 267     form a single show, for example multiple acts of an opera or TV episode.
 268     It must have an entries key like a playlist and contain all the keys
 269     required for a video at the same time.
 270
 271
 272     _type "url" indicates that the video must be extracted from another
 273     location, possibly by a different extractor. Its only required key is:
 274     "url" - the next URL to extract.
 275     The key "ie_key" can be set to the class name (minus the trailing "IE",
 276     e.g. "Youtube") if the extractor class is known in advance.
 277     Additionally, the dictionary may have any properties of the resolved entity
 278     known in advance, for example "title" if the title of the referred video is
 279     known ahead of time.
 280
 281
 282     _type "url_transparent" entities have the same specification as "url", but
 283     indicate that the given additional information is more precise than the one
 284     associated with the resolved URL.
 285     This is useful when a site employs a video service that hosts the video and
 286     its technical metadata, but that video service does not embed a useful
 287     title, description etc.
 288
 289
 290     Subclasses of this one should re-define the _real_initialize() and
 291     _real_extract() methods and define a _VALID_URL regexp.
 292     Probably, they should also be added to the list of extractors.
 293
 294     Finally, the _WORKING attribute should be set to False for broken IEs
 295     in order to warn the users and skip the tests.
 296     """
 297
    # Becomes True on an instance once initialize() has run
    # _real_initialize() for it.
    _ready = False
    # Downloader object the extractor talks to; assigned by set_downloader().
    _downloader = None
    # Returned by working(); set to False in subclasses for broken extractors.
    _WORKING = True
 301
 302     def __init__(self, downloader=None):
 303         """Constructor. Receives an optional downloader."""
 304         self._ready = False
 305         self.set_downloader(downloader)
 306
 307     @classmethod
 308     def suitable(cls, url):
 309         """Receives a URL and returns True if suitable for this IE."""
 310
 311         # This does not use has/getattr intentionally - we want to know whether
 312         # we have cached the regexp for *this* class, whereas getattr would also
 313         # match the superclass
 314         if '_VALID_URL_RE' not in cls.__dict__:
 315             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 316         return cls._VALID_URL_RE.match(url) is not None
 317
 318     @classmethod
 319     def _match_id(cls, url):
 320         if '_VALID_URL_RE' not in cls.__dict__:
 321             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 322         m = cls._VALID_URL_RE.match(url)
 323         assert m
 324         return m.group('id')
 325
 326     @classmethod
 327     def working(cls):
 328         """Getter method for _WORKING."""
 329         return cls._WORKING
 330
 331     def initialize(self):
 332         """Initializes an instance (authentication, etc)."""
 333         if not self._ready:
 334             self._real_initialize()
 335             self._ready = True
 336
 337     def extract(self, url):
 338         """Extracts URL information and returns it in list of dicts."""
 339         try:
 340             self.initialize()
 341             return self._real_extract(url)
 342         except ExtractorError:
 343             raise
 344         except compat_http_client.IncompleteRead as e:
 345             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 346         except (KeyError, StopIteration) as e:
 347             raise ExtractorError('An extractor error has occurred.', cause=e)
 348
 349     def set_downloader(self, downloader):
 350         """Sets the downloader for this IE."""
 351         self._downloader = downloader
 352
 353     def _real_initialize(self):
 354         """Real initialization process. Redefine in subclasses."""
 355         pass
 356
 357     def _real_extract(self, url):
 358         """Real extraction process. Redefine in subclasses."""
 359         pass
 360
 361     @classmethod
 362     def ie_key(cls):
 363         """A string for getting the InfoExtractor with get_info_extractor"""
 364         return compat_str(cls.__name__[:-2])
 365
 366     @property
 367     def IE_NAME(self):
 368         return compat_str(type(self).__name__[:-2])
 369
 370     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 371         """ Returns the response handle """
 372         if note is None:
 373             self.report_download_webpage(video_id)
 374         elif note is not False:
 375             if video_id is None:
 376                 self.to_screen('%s' % (note,))
 377             else:
 378                 self.to_screen('%s: %s' % (video_id, note))
 379         # data, headers and query params will be ignored for `Request` objects
 380         if isinstance(url_or_request, compat_urllib_request.Request):
 381             url_or_request = update_Request(
 382                 url_or_request, data=data, headers=headers, query=query)
 383         else:
 384             if query:
 385                 url_or_request = update_url_query(url_or_request, query)
 386             if data or headers:
 387                 url_or_request = sanitized_Request(url_or_request, data, headers)
 388         try:
 389             return self._downloader.urlopen(url_or_request)
 390         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 391             if errnote is False:
 392                 return False
 393             if errnote is None:
 394                 errnote = 'Unable to download webpage'
 395
 396             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 397             if fatal:
 398                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 399             else:
 400                 self._downloader.report_warning(errmsg)
 401                 return False
 402
 403     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 404         """ Returns a tuple (page content as string, URL handle) """
 405         # Strip hashes from the URL (#1038)
 406         if isinstance(url_or_request, (compat_str, str)):
 407             url_or_request = url_or_request.partition('#')[0]
 408
 409         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 410         if urlh is False:
 411             assert not fatal
 412             return False
 413         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 414         return (content, urlh)
 415
 416     @staticmethod
 417     def _guess_encoding_from_content(content_type, webpage_bytes):
 418         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 419         if m:
 420             encoding = m.group(1)
 421         else:
 422             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 423                           webpage_bytes[:1024])
 424             if m:
 425                 encoding = m.group(1).decode('ascii')
 426             elif webpage_bytes.startswith(b'\xff\xfe'):
 427                 encoding = 'utf-16'
 428             else:
 429                 encoding = 'utf-8'
 430
 431         return encoding
 432
 433     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 434         content_type = urlh.headers.get('Content-Type', '')
 435         webpage_bytes = urlh.read()
 436         if prefix is not None:
 437             webpage_bytes = prefix + webpage_bytes
 438         if not encoding:
 439             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 440         if self._downloader.params.get('dump_intermediate_pages', False):
 441             try:
 442                 url = url_or_request.get_full_url()
 443             except AttributeError:
 444                 url = url_or_request
 445             self.to_screen('Dumping request to ' + url)
 446             dump = base64.b64encode(webpage_bytes).decode('ascii')
 447             self._downloader.to_screen(dump)
 448         if self._downloader.params.get('write_pages', False):
 449             try:
 450                 url = url_or_request.get_full_url()
 451             except AttributeError:
 452                 url = url_or_request
 453             basen = '%s_%s' % (video_id, url)
 454             if len(basen) > 240:
 455                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 456                 basen = basen[:240 - len(h)] + h
 457             raw_filename = basen + '.dump'
 458             filename = sanitize_filename(raw_filename, restricted=True)
 459             self.to_screen('Saving request to ' + filename)
 460             # Working around MAX_PATH limitation on Windows (see
 461             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 462             if compat_os_name == 'nt':
 463                 absfilepath = os.path.abspath(filename)
 464                 if len(absfilepath) > 259:
 465                     filename = '\\\\?\\' + absfilepath
 466             with open(filename, 'wb') as outf:
 467                 outf.write(webpage_bytes)
 468
 469         try:
 470             content = webpage_bytes.decode(encoding, 'replace')
 471         except LookupError:
 472             content = webpage_bytes.decode('utf-8', 'replace')
 473
 474         if ('<title>Access to this site is blocked</title>' in content and
 475                 'Websense' in content[:512]):
 476             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 477             blocked_iframe = self._html_search_regex(
 478                 r'<iframe src="([^"]+)"', content,
 479                 'Websense information URL', default=None)
 480             if blocked_iframe:
 481                 msg += ' Visit %s for more details' % blocked_iframe
 482             raise ExtractorError(msg, expected=True)
 483         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 484             msg = (
 485                 'Access to this webpage has been blocked by Indian censorship. '
 486                 'Use a VPN or proxy server (with --proxy) to route around it.')
 487             block_msg = self._html_search_regex(
 488                 r'</h1><p>(.*?)</p>',
 489                 content, 'block message', default=None)
 490             if block_msg:
 491                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 492             raise ExtractorError(msg, expected=True)
 493
 494         return content
 495
 496     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 497         """ Returns the data of the page as a string """
 498         success = False
 499         try_count = 0
 500         while success is False:
 501             try:
 502                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 503                 success = True
 504             except compat_http_client.IncompleteRead as e:
 505                 try_count += 1
 506                 if try_count >= tries:
 507                     raise e
 508                 self._sleep(timeout, video_id)
 509         if res is False:
 510             return res
 511         else:
 512             content, _ = res
 513             return content
 514
 515     def _download_xml(self, url_or_request, video_id,
 516                       note='Downloading XML', errnote='Unable to download XML',
 517                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 518         """Return the xml as an xml.etree.ElementTree.Element"""
 519         xml_string = self._download_webpage(
 520             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 521         if xml_string is False:
 522             return xml_string
 523         if transform_source:
 524             xml_string = transform_source(xml_string)
 525         return compat_etree_fromstring(xml_string.encode('utf-8'))
 526
 527     def _download_json(self, url_or_request, video_id,
 528                        note='Downloading JSON metadata',
 529                        errnote='Unable to download JSON metadata',
 530                        transform_source=None,
 531                        fatal=True, encoding=None, data=None, headers={}, query={}):
 532         json_string = self._download_webpage(
 533             url_or_request, video_id, note, errnote, fatal=fatal,
 534             encoding=encoding, data=data, headers=headers, query=query)
 535         if (not fatal) and json_string is False:
 536             return None
 537         return self._parse_json(
 538             json_string, video_id, transform_source=transform_source, fatal=fatal)
 539
 540     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 541         if transform_source:
 542             json_string = transform_source(json_string)
 543         try:
 544             return json.loads(json_string)
 545         except ValueError as ve:
 546             errmsg = '%s: Failed to parse JSON ' % video_id
 547             if fatal:
 548                 raise ExtractorError(errmsg, cause=ve)
 549             else:
 550                 self.report_warning(errmsg + str(ve))
 551
 552     def report_warning(self, msg, video_id=None):
 553         idstr = '' if video_id is None else '%s: ' % video_id
 554         self._downloader.report_warning(
 555             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 556
 557     def to_screen(self, msg):
 558         """Print msg to screen, prefixing it with '[ie_name]'"""
 559         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 560
 561     def report_extraction(self, id_or_name):
 562         """Report information extraction."""
 563         self.to_screen('%s: Extracting information' % id_or_name)
 564
 565     def report_download_webpage(self, video_id):
 566         """Report webpage download."""
 567         self.to_screen('%s: Downloading webpage' % video_id)
 568
 569     def report_age_confirmation(self):
 570         """Report attempt to confirm age."""
 571         self.to_screen('Confirming age')
 572
 573     def report_login(self):
 574         """Report attempt to log in."""
 575         self.to_screen('Logging in')
 576
 577     @staticmethod
 578     def raise_login_required(msg='This video is only available for registered users'):
 579         raise ExtractorError(
 580             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 581             expected=True)
 582
 583     @staticmethod
 584     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 585         raise ExtractorError(
 586             '%s. You might want to use --proxy to workaround.' % msg,
 587             expected=True)
 588
 589     # Methods for following #608
 590     @staticmethod
 591     def url_result(url, ie=None, video_id=None, video_title=None):
 592         """Returns a URL that points to a page that should be processed"""
 593         # TODO: ie should be the class used for getting the info
 594         video_info = {'_type': 'url',
 595                       'url': url,
 596                       'ie_key': ie}
 597         if video_id is not None:
 598             video_info['id'] = video_id
 599         if video_title is not None:
 600             video_info['title'] = video_title
 601         return video_info
 602
 603     @staticmethod
 604     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 605         """Returns a playlist"""
 606         video_info = {'_type': 'playlist',
 607                       'entries': entries}
 608         if playlist_id:
 609             video_info['id'] = playlist_id
 610         if playlist_title:
 611             video_info['title'] = playlist_title
 612         if playlist_description:
 613             video_info['description'] = playlist_description
 614         return video_info
 615
 616     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 617         """
 618         Perform a regex search on the given string, using a single or a list of
 619         patterns returning the first matching group.
 620         In case of failure return a default value or raise a WARNING or a
 621         RegexNotFoundError, depending on fatal, specifying the field name.
 622         """
 623         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 624             mobj = re.search(pattern, string, flags)
 625         else:
 626             for p in pattern:
 627                 mobj = re.search(p, string, flags)
 628                 if mobj:
 629                     break
 630
 631         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 632             _name = '\033[0;34m%s\033[0m' % name
 633         else:
 634             _name = name
 635
 636         if mobj:
 637             if group is None:
 638                 # return the first matching group
 639                 return next(g for g in mobj.groups() if g is not None)
 640             else:
 641                 return mobj.group(group)
 642         elif default is not NO_DEFAULT:
 643             return default
 644         elif fatal:
 645             raise RegexNotFoundError('Unable to extract %s' % _name)
 646         else:
 647             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 648             return None
 649
 650     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 651         """
 652         Like _search_regex, but strips HTML tags and unescapes entities.
 653         """
 654         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 655         if res:
 656             return clean_html(res).strip()
 657         else:
 658             return res
 659
 660     def _get_login_info(self):
 661         """
 662         Get the login info as (username, password)
 663         It will look in the netrc file using the _NETRC_MACHINE value
 664         If there's no info available, return (None, None)
 665         """
 666         if self._downloader is None:
 667             return (None, None)
 668
 669         username = None
 670         password = None
 671         downloader_params = self._downloader.params
 672
 673         # Attempt to use provided username and password or .netrc data
 674         if downloader_params.get('username') is not None:
 675             username = downloader_params['username']
 676             password = downloader_params['password']
 677         elif downloader_params.get('usenetrc', False):
 678             try:
 679                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 680                 if info is not None:
 681                     username = info[0]
 682                     password = info[2]
 683                 else:
 684                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 685             except (IOError, netrc.NetrcParseError) as err:
 686                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 687
 688         return (username, password)
 689
 690     def _get_tfa_info(self, note='two-factor verification code'):
 691         """
 692         Get the two-factor authentication info
 693         TODO - asking the user will be required for sms/phone verify
 694         currently just uses the command line option
 695         If there's no info available, return None
 696         """
 697         if self._downloader is None:
 698             return None
 699         downloader_params = self._downloader.params
 700
 701         if downloader_params.get('twofactor') is not None:
 702             return downloader_params['twofactor']
 703
 704         return compat_getpass('Type %s and press [Return]: ' % note)
 705
 706     # Helper functions for extracting OpenGraph info
 707     @staticmethod
 708     def _og_regexes(prop):
 709         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 710         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 711                        % {'prop': re.escape(prop)})
 712         template = r'<meta[^>]+?%s[^>]+?%s'
 713         return [
 714             template % (property_re, content_re),
 715             template % (content_re, property_re),
 716         ]
 717
 718     @staticmethod
 719     def _meta_regex(prop):
 720         return r'''(?isx)<meta
 721                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 722                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 723
 724     def _og_search_property(self, prop, html, name=None, **kargs):
 725         if name is None:
 726             name = 'OpenGraph %s' % prop
 727         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 728         if escaped is None:
 729             return None
 730         return unescapeHTML(escaped)
 731
 732     def _og_search_thumbnail(self, html, **kargs):
 733         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 734
 735     def _og_search_description(self, html, **kargs):
 736         return self._og_search_property('description', html, fatal=False, **kargs)
 737
 738     def _og_search_title(self, html, **kargs):
 739         return self._og_search_property('title', html, **kargs)
 740
 741     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 742         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 743         if secure:
 744             regexes = self._og_regexes('video:secure_url') + regexes
 745         return self._html_search_regex(regexes, html, name, **kargs)
 746
 747     def _og_search_url(self, html, **kargs):
 748         return self._og_search_property('url', html, **kargs)
 749
 750     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 751         if display_name is None:
 752             display_name = name
 753         return self._html_search_regex(
 754             self._meta_regex(name),
 755             html, display_name, fatal=fatal, group='content', **kwargs)
 756
 757     def _dc_search_uploader(self, html):
 758         return self._html_search_meta('dc.creator', html, 'uploader')
 759
 760     def _rta_search(self, html):
 761         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 762         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 763                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 764                      html):
 765             return 18
 766         return 0
 767
 768     def _media_rating_search(self, html):
 769         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 770         rating = self._html_search_meta('rating', html)
 771
 772         if not rating:
 773             return None
 774
 775         RATING_TABLE = {
 776             'safe for kids': 0,
 777             'general': 8,
 778             '14 years': 14,
 779             'mature': 17,
 780             'restricted': 19,
 781         }
 782         return RATING_TABLE.get(rating.lower())
 783
 784     def _family_friendly_search(self, html):
 785         # See http://schema.org/VideoObject
 786         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 787
 788         if not family_friendly:
 789             return None
 790
 791         RATING_TABLE = {
 792             '1': 0,
 793             'true': 0,
 794             '0': 18,
 795             'false': 18,
 796         }
 797         return RATING_TABLE.get(family_friendly.lower())
 798
 799     def _twitter_search_player(self, html):
 800         return self._html_search_meta('twitter:player', html,
 801                                       'twitter card player')
 802
 803     def _search_json_ld(self, html, video_id, **kwargs):
 804         json_ld = self._search_regex(
 805             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 806             html, 'JSON-LD', group='json_ld', **kwargs)
 807         if not json_ld:
 808             return {}
 809         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 810
 811     def _json_ld(self, json_ld, video_id, fatal=True):
 812         if isinstance(json_ld, compat_str):
 813             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 814         if not json_ld:
 815             return {}
 816         info = {}
 817         if json_ld.get('@context') == 'http://schema.org':
 818             item_type = json_ld.get('@type')
 819             if item_type == 'TVEpisode':
 820                 info.update({
 821                     'episode': unescapeHTML(json_ld.get('name')),
 822                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 823                     'description': unescapeHTML(json_ld.get('description')),
 824                 })
 825                 part_of_season = json_ld.get('partOfSeason')
 826                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 827                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 828                 part_of_series = json_ld.get('partOfSeries')
 829                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 830                     info['series'] = unescapeHTML(part_of_series.get('name'))
 831             elif item_type == 'Article':
 832                 info.update({
 833                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 834                     'title': unescapeHTML(json_ld.get('headline')),
 835                     'description': unescapeHTML(json_ld.get('articleBody')),
 836                 })
 837         return dict((k, v) for k, v in info.items() if v is not None)
 838
 839     @staticmethod
 840     def _hidden_inputs(html):
 841         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 842         hidden_inputs = {}
 843         for input in re.findall(r'(?i)<input([^>]+)>', html):
 844             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 845                 continue
 846             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 847             if not name:
 848                 continue
 849             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 850             if not value:
 851                 continue
 852             hidden_inputs[name.group('value')] = value.group('value')
 853         return hidden_inputs
 854
 855     def _form_hidden_inputs(self, form_id, html):
 856         form = self._search_regex(
 857             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 858             html, '%s form' % form_id, group='form')
 859         return self._hidden_inputs(form)
 860
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place, in ascending order of the computed sort key.

        Raises ExtractorError when formats is empty. As a side effect a
        missing 'tbr' is filled in from 'abr' + 'vbr' and a missing 'ext' is
        determined from 'url'.

        When field_preference is a list or tuple of field names, the key is
        just the tuple of those fields (None replaced by -1). Otherwise the
        key is built from 'preference', language, quality, bitrates, sizes,
        dimensions, protocol and extension preferences as laid out below.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # An explicit field list replaces the default ordering entirely
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Anything that is not plain HTTP(S) sorts slightly lower
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            # ORDER lists extensions by ascending preference; an extension
            # that is not listed gets -1. Only one of ext_preference and
            # audio_ext_preference is looked up, the other is fixed to 0.
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple elements are compared left to right; missing numeric
            # fields count as -1 and a missing format_id as ''
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 931
 932     def _check_formats(self, formats, video_id):
 933         if formats:
 934             formats[:] = filter(
 935                 lambda f: self._is_valid_url(
 936                     f['url'], video_id,
 937                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 938                 formats)
 939
 940     @staticmethod
 941     def _remove_duplicate_formats(formats):
 942         format_urls = set()
 943         unique_formats = []
 944         for f in formats:
 945             if f['url'] not in format_urls:
 946                 format_urls.add(f['url'])
 947                 unique_formats.append(f)
 948         formats[:] = unique_formats
 949
 950     def _is_valid_url(self, url, video_id, item='video'):
 951         url = self._proto_relative_url(url, scheme='http:')
 952         # For now assume non HTTP(S) URLs always valid
 953         if not (url.startswith('http://') or url.startswith('https://')):
 954             return True
 955         try:
 956             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 957             return True
 958         except ExtractorError as e:
 959             if isinstance(e.cause, compat_urllib_error.URLError):
 960                 self.to_screen(
 961                     '%s: %s URL is invalid, skipping' % (video_id, item))
 962                 return False
 963             raise
 964
 965     def http_scheme(self):
 966         """ Either "http:" or "https:", depending on the user's preferences """
 967         return (
 968             'http:'
 969             if self._downloader.params.get('prefer_insecure', False)
 970             else 'https:')
 971
 972     def _proto_relative_url(self, url, scheme=None):
 973         if url is None:
 974             return url
 975         if url.startswith('//'):
 976             if scheme is None:
 977                 scheme = self.http_scheme()
 978             return scheme + url
 979         else:
 980             return url
 981
 982     def _sleep(self, timeout, video_id, msg_template=None):
 983         if msg_template is None:
 984             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 985         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 986         self.to_screen(msg)
 987         time.sleep(timeout)
 988
 989     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 990                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 991                              fatal=True):
 992         manifest = self._download_xml(
 993             manifest_url, video_id, 'Downloading f4m manifest',
 994             'Unable to download f4m manifest',
 995             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 996             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 997             transform_source=transform_source,
 998             fatal=fatal)
 999
1000         if manifest is False:
1001             return []
1002
1003         return self._parse_f4m_formats(
1004             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1005             transform_source=transform_source, fatal=fatal)
1006
1007     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1008                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1009                            fatal=True):
1010         formats = []
1011         manifest_version = '1.0'
1012         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1013         if not media_nodes:
1014             manifest_version = '2.0'
1015             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1016         # Remove unsupported DRM protected media from final formats
1017         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1018         media_nodes = remove_encrypted_media(media_nodes)
1019         if not media_nodes:
1020             return formats
1021         base_url = xpath_text(
1022             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1023             'base URL', default=None)
1024         if base_url:
1025             base_url = base_url.strip()
1026         for i, media_el in enumerate(media_nodes):
1027             if manifest_version == '2.0':
1028                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
1029                 if not media_url:
1030                     continue
1031                 manifest_url = (
1032                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1033                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1034                 # If media_url is itself a f4m manifest do the recursive extraction
1035                 # since bitrates in parent manifest (this one) and media_url manifest
1036                 # may differ leading to inability to resolve the format by requested
1037                 # bitrate in f4m downloader
1038                 if determine_ext(manifest_url) == 'f4m':
1039                     formats.extend(self._extract_f4m_formats(
1040                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1041                         transform_source=transform_source, fatal=fatal))
1042                     continue
1043             tbr = int_or_none(media_el.attrib.get('bitrate'))
1044             formats.append({
1045                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1046                 'url': manifest_url,
1047                 'ext': 'flv',
1048                 'tbr': tbr,
1049                 'width': int_or_none(media_el.attrib.get('width')),
1050                 'height': int_or_none(media_el.attrib.get('height')),
1051                 'preference': preference,
1052             })
1053         return formats
1054
1055     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1056                               entry_protocol='m3u8', preference=None,
1057                               m3u8_id=None, note=None, errnote=None,
1058                               fatal=True):
1059
1060         formats = [{
1061             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1062             'url': m3u8_url,
1063             'ext': ext,
1064             'protocol': 'm3u8',
1065             'preference': preference - 1 if preference else -1,
1066             'resolution': 'multiple',
1067             'format_note': 'Quality selection URL',
1068         }]
1069
1070         format_url = lambda u: (
1071             u
1072             if re.match(r'^https?://', u)
1073             else compat_urlparse.urljoin(m3u8_url, u))
1074
1075         res = self._download_webpage_handle(
1076             m3u8_url, video_id,
1077             note=note or 'Downloading m3u8 information',
1078             errnote=errnote or 'Failed to download m3u8 information',
1079             fatal=fatal)
1080         if res is False:
1081             return []
1082         m3u8_doc, urlh = res
1083         m3u8_url = urlh.geturl()
1084
1085         # We should try extracting formats only from master playlists [1], i.e.
1086         # playlists that describe available qualities. On the other hand media
1087         # playlists [2] should be returned as is since they contain just the media
1088         # without qualities renditions.
1089         # Fortunately, master playlist can be easily distinguished from media
1090         # playlist based on particular tags availability. As of [1, 2] master
1091         # playlist tags MUST NOT appear in a media playist and vice versa.
1092         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1093         # and MUST NOT appear in master playlist thus we can clearly detect media
1094         # playlist with this criterion.
1095         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1096         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1097         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1098         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1099             return [{
1100                 'url': m3u8_url,
1101                 'format_id': m3u8_id,
1102                 'ext': ext,
1103                 'protocol': entry_protocol,
1104                 'preference': preference,
1105             }]
1106         last_info = None
1107         last_media = None
1108         kv_rex = re.compile(
1109             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1110         for line in m3u8_doc.splitlines():
1111             if line.startswith('#EXT-X-STREAM-INF:'):
1112                 last_info = {}
1113                 for m in kv_rex.finditer(line):
1114                     v = m.group('val')
1115                     if v.startswith('"'):
1116                         v = v[1:-1]
1117                     last_info[m.group('key')] = v
1118             elif line.startswith('#EXT-X-MEDIA:'):
1119                 last_media = {}
1120                 for m in kv_rex.finditer(line):
1121                     v = m.group('val')
1122                     if v.startswith('"'):
1123                         v = v[1:-1]
1124                     last_media[m.group('key')] = v
1125             elif line.startswith('#') or not line.strip():
1126                 continue
1127             else:
1128                 if last_info is None:
1129                     formats.append({'url': format_url(line)})
1130                     continue
1131                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1132                 format_id = []
1133                 if m3u8_id:
1134                     format_id.append(m3u8_id)
1135                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1136                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1137                 f = {
1138                     'format_id': '-'.join(format_id),
1139                     'url': format_url(line.strip()),
1140                     'tbr': tbr,
1141                     'ext': ext,
1142                     'protocol': entry_protocol,
1143                     'preference': preference,
1144                 }
1145                 resolution = last_info.get('RESOLUTION')
1146                 if resolution:
1147                     width_str, height_str = resolution.split('x')
1148                     f['width'] = int(width_str)
1149                     f['height'] = int(height_str)
1150                 codecs = last_info.get('CODECS')
1151                 if codecs:
1152                     vcodec, acodec = [None] * 2
1153                     va_codecs = codecs.split(',')
1154                     if len(va_codecs) == 1:
1155                         # Audio only entries usually come with single codec and
1156                         # no resolution. For more robustness we also check it to
1157                         # be mp4 audio.
1158                         if not resolution and va_codecs[0].startswith('mp4a'):
1159                             vcodec, acodec = 'none', va_codecs[0]
1160                         else:
1161                             vcodec = va_codecs[0]
1162                     else:
1163                         vcodec, acodec = va_codecs[:2]
1164                     f.update({
1165                         'acodec': acodec,
1166                         'vcodec': vcodec,
1167                     })
1168                 if last_media is not None:
1169                     f['m3u8_media'] = last_media
1170                     last_media = None
1171                 formats.append(f)
1172                 last_info = {}
1173         return formats
1174
1175     @staticmethod
1176     def _xpath_ns(path, namespace=None):
1177         if not namespace:
1178             return path
1179         out = []
1180         for c in path.split('/'):
1181             if not c or c == '.':
1182                 out.append(c)
1183             else:
1184                 out.append('{%s}%s' % (namespace, c))
1185         return '/'.join(out)
1186
1187     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1188         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1189
1190         if smil is False:
1191             assert not fatal
1192             return []
1193
1194         namespace = self._parse_smil_namespace(smil)
1195
1196         return self._parse_smil_formats(
1197             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1198
1199     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1200         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1201         if smil is False:
1202             return {}
1203         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1204
1205     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1206         return self._download_xml(
1207             smil_url, video_id, 'Downloading SMIL file',
1208             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1209
1210     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1211         namespace = self._parse_smil_namespace(smil)
1212
1213         formats = self._parse_smil_formats(
1214             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1215         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1216
1217         video_id = os.path.splitext(url_basename(smil_url))[0]
1218         title = None
1219         description = None
1220         upload_date = None
1221         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1222             name = meta.attrib.get('name')
1223             content = meta.attrib.get('content')
1224             if not name or not content:
1225                 continue
1226             if not title and name == 'title':
1227                 title = content
1228             elif not description and name in ('description', 'abstract'):
1229                 description = content
1230             elif not upload_date and name == 'date':
1231                 upload_date = unified_strdate(content)
1232
1233         thumbnails = [{
1234             'id': image.get('type'),
1235             'url': image.get('src'),
1236             'width': int_or_none(image.get('width')),
1237             'height': int_or_none(image.get('height')),
1238         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1239
1240         return {
1241             'id': video_id,
1242             'title': title or video_id,
1243             'description': description,
1244             'upload_date': upload_date,
1245             'thumbnails': thumbnails,
1246             'formats': formats,
1247             'subtitles': subtitles,
1248         }
1249
1250     def _parse_smil_namespace(self, smil):
1251         return self._search_regex(
1252             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1253
1254     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1255         base = smil_url
1256         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1257             b = meta.get('base') or meta.get('httpBase')
1258             if b:
1259                 base = b
1260                 break
1261
1262         formats = []
1263         rtmp_count = 0
1264         http_count = 0
1265         m3u8_count = 0
1266
1267         srcs = []
1268         videos = smil.findall(self._xpath_ns('.//video', namespace))
1269         for video in videos:
1270             src = video.get('src')
1271             if not src or src in srcs:
1272                 continue
1273             srcs.append(src)
1274
1275             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1276             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1277             width = int_or_none(video.get('width'))
1278             height = int_or_none(video.get('height'))
1279             proto = video.get('proto')
1280             ext = video.get('ext')
1281             src_ext = determine_ext(src)
1282             streamer = video.get('streamer') or base
1283
1284             if proto == 'rtmp' or streamer.startswith('rtmp'):
1285                 rtmp_count += 1
1286                 formats.append({
1287                     'url': streamer,
1288                     'play_path': src,
1289                     'ext': 'flv',
1290                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1291                     'tbr': bitrate,
1292                     'filesize': filesize,
1293                     'width': width,
1294                     'height': height,
1295                 })
1296                 if transform_rtmp_url:
1297                     streamer, src = transform_rtmp_url(streamer, src)
1298                     formats[-1].update({
1299                         'url': streamer,
1300                         'play_path': src,
1301                     })
1302                 continue
1303
1304             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1305             src_url = src_url.strip()
1306
1307             if proto == 'm3u8' or src_ext == 'm3u8':
1308                 m3u8_formats = self._extract_m3u8_formats(
1309                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1310                 if len(m3u8_formats) == 1:
1311                     m3u8_count += 1
1312                     m3u8_formats[0].update({
1313                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1314                         'tbr': bitrate,
1315                         'width': width,
1316                         'height': height,
1317                     })
1318                 formats.extend(m3u8_formats)
1319                 continue
1320
1321             if src_ext == 'f4m':
1322                 f4m_url = src_url
1323                 if not f4m_params:
1324                     f4m_params = {
1325                         'hdcore': '3.2.0',
1326                         'plugin': 'flowplayer-3.2.0.1',
1327                     }
1328                 f4m_url += '&' if '?' in f4m_url else '?'
1329                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1330                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1331                 continue
1332
1333             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1334                 http_count += 1
1335                 formats.append({
1336                     'url': src_url,
1337                     'ext': ext or src_ext or 'flv',
1338                     'format_id': 'http-%d' % (bitrate or http_count),
1339                     'tbr': bitrate,
1340                     'filesize': filesize,
1341                     'width': width,
1342                     'height': height,
1343                 })
1344                 continue
1345
1346         return formats
1347
1348     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1349         urls = []
1350         subtitles = {}
1351         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1352             src = textstream.get('src')
1353             if not src or src in urls:
1354                 continue
1355             urls.append(src)
1356             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1357             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1358             subtitles.setdefault(lang, []).append({
1359                 'url': src,
1360                 'ext': ext,
1361             })
1362         return subtitles
1363
1364     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1365         xspf = self._download_xml(
1366             playlist_url, playlist_id, 'Downloading xpsf playlist',
1367             'Unable to download xspf manifest', fatal=fatal)
1368         if xspf is False:
1369             return []
1370         return self._parse_xspf(xspf, playlist_id)
1371
1372     def _parse_xspf(self, playlist, playlist_id):
1373         NS_MAP = {
1374             'xspf': 'http://xspf.org/ns/0/',
1375             's1': 'http://static.streamone.nl/player/ns/0',
1376         }
1377
1378         entries = []
1379         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1380             title = xpath_text(
1381                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1382             description = xpath_text(
1383                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1384             thumbnail = xpath_text(
1385                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1386             duration = float_or_none(
1387                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1388
1389             formats = [{
1390                 'url': location.text,
1391                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1392                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1393                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1394             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1395             self._sort_formats(formats)
1396
1397             entries.append({
1398                 'id': playlist_id,
1399                 'title': title,
1400                 'description': description,
1401                 'thumbnail': thumbnail,
1402                 'duration': duration,
1403                 'formats': formats,
1404             })
1405         return entries
1406
1407     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1408         res = self._download_webpage_handle(
1409             mpd_url, video_id,
1410             note=note or 'Downloading MPD manifest',
1411             errnote=errnote or 'Failed to download MPD manifest',
1412             fatal=fatal)
1413         if res is False:
1414             return []
1415         mpd, urlh = res
1416         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1417
1418         return self._parse_mpd_formats(
1419             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1420
1421     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1422         if mpd_doc.get('type') == 'dynamic':
1423             return []
1424
1425         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1426
1427         def _add_ns(path):
1428             return self._xpath_ns(path, namespace)
1429
1430         def is_drm_protected(element):
1431             return element.find(_add_ns('ContentProtection')) is not None
1432
1433         def extract_multisegment_info(element, ms_parent_info):
1434             ms_info = ms_parent_info.copy()
1435             segment_list = element.find(_add_ns('SegmentList'))
1436             if segment_list is not None:
1437                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1438                 if segment_urls_e:
1439                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1440                 initialization = segment_list.find(_add_ns('Initialization'))
1441                 if initialization is not None:
1442                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1443             else:
1444                 segment_template = element.find(_add_ns('SegmentTemplate'))
1445                 if segment_template is not None:
1446                     start_number = segment_template.get('startNumber')
1447                     if start_number:
1448                         ms_info['start_number'] = int(start_number)
1449                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1450                     if segment_timeline is not None:
1451                         s_e = segment_timeline.findall(_add_ns('S'))
1452                         if s_e:
1453                             ms_info['total_number'] = 0
1454                             for s in s_e:
1455                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1456                     else:
1457                         timescale = segment_template.get('timescale')
1458                         if timescale:
1459                             ms_info['timescale'] = int(timescale)
1460                         segment_duration = segment_template.get('duration')
1461                         if segment_duration:
1462                             ms_info['segment_duration'] = int(segment_duration)
1463                     media_template = segment_template.get('media')
1464                     if media_template:
1465                         ms_info['media_template'] = media_template
1466                     initialization = segment_template.get('initialization')
1467                     if initialization:
1468                         ms_info['initialization_url'] = initialization
1469                     else:
1470                         initialization = segment_template.find(_add_ns('Initialization'))
1471                         if initialization is not None:
1472                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1473             return ms_info
1474
1475         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1476         formats = []
1477         for period in mpd_doc.findall(_add_ns('Period')):
1478             period_duration = parse_duration(period.get('duration')) or mpd_duration
1479             period_ms_info = extract_multisegment_info(period, {
1480                 'start_number': 1,
1481                 'timescale': 1,
1482             })
1483             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1484                 if is_drm_protected(adaptation_set):
1485                     continue
1486                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1487                 for representation in adaptation_set.findall(_add_ns('Representation')):
1488                     if is_drm_protected(representation):
1489                         continue
1490                     representation_attrib = adaptation_set.attrib.copy()
1491                     representation_attrib.update(representation.attrib)
1492                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1493                     mime_type = representation_attrib['mimeType']
1494                     content_type = mime_type.split('/')[0]
1495                     if content_type == 'text':
1496                         # TODO implement WebVTT downloading
1497                         pass
1498                     elif content_type == 'video' or content_type == 'audio':
1499                         base_url = ''
1500                         for element in (representation, adaptation_set, period, mpd_doc):
1501                             base_url_e = element.find(_add_ns('BaseURL'))
1502                             if base_url_e is not None:
1503                                 base_url = base_url_e.text + base_url
1504                                 if re.match(r'^https?://', base_url):
1505                                     break
1506                         if mpd_base_url and not re.match(r'^https?://', base_url):
1507                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1508                                 mpd_base_url += '/'
1509                             base_url = mpd_base_url + base_url
1510                         representation_id = representation_attrib.get('id')
1511                         lang = representation_attrib.get('lang')
1512                         url_el = representation.find(_add_ns('BaseURL'))
1513                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1514                         f = {
1515                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1516                             'url': base_url,
1517                             'ext': mimetype2ext(mime_type),
1518                             'width': int_or_none(representation_attrib.get('width')),
1519                             'height': int_or_none(representation_attrib.get('height')),
1520                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1521                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1522                             'fps': int_or_none(representation_attrib.get('frameRate')),
1523                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1524                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1525                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1526                             'format_note': 'DASH %s' % content_type,
1527                             'filesize': filesize,
1528                         }
1529                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1530                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1531                             if 'total_number' not in representation_ms_info and 'segment_duration':
1532                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1533                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1534                             media_template = representation_ms_info['media_template']
1535                             media_template = media_template.replace('$RepresentationID$', representation_id)
1536                             media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
1537                             media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
1538                             media_template.replace('$$', '$')
1539                             representation_ms_info['segment_urls'] = [
1540                                 media_template % {
1541                                     'Number': segment_number,
1542                                     'Bandwidth': representation_attrib.get('bandwidth')}
1543                                 for segment_number in range(
1544                                     representation_ms_info['start_number'],
1545                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1546                         if 'segment_urls' in representation_ms_info:
1547                             f.update({
1548                                 'segment_urls': representation_ms_info['segment_urls'],
1549                                 'protocol': 'http_dash_segments',
1550                             })
1551                             if 'initialization_url' in representation_ms_info:
1552                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1553                                 f.update({
1554                                     'initialization_url': initialization_url,
1555                                 })
1556                                 if not f.get('url'):
1557                                     f['url'] = initialization_url
1558                         try:
1559                             existing_format = next(
1560                                 fo for fo in formats
1561                                 if fo['format_id'] == representation_id)
1562                         except StopIteration:
1563                             full_info = formats_dict.get(representation_id, {}).copy()
1564                             full_info.update(f)
1565                             formats.append(full_info)
1566                         else:
1567                             existing_format.update(f)
1568                     else:
1569                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1570         return formats
1571
1572     def _live_title(self, name):
1573         """ Generate the title for a live video """
1574         now = datetime.datetime.now()
1575         now_str = now.strftime('%Y-%m-%d %H:%M')
1576         return name + ' ' + now_str
1577
1578     def _int(self, v, name, fatal=False, **kwargs):
1579         res = int_or_none(v, **kwargs)
1580         if 'get_attr' in kwargs:
1581             print(getattr(v, kwargs['get_attr']))
1582         if res is None:
1583             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1584             if fatal:
1585                 raise ExtractorError(msg)
1586             else:
1587                 self._downloader.report_warning(msg)
1588         return res
1589
1590     def _float(self, v, name, fatal=False, **kwargs):
1591         res = float_or_none(v, **kwargs)
1592         if res is None:
1593             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1594             if fatal:
1595                 raise ExtractorError(msg)
1596             else:
1597                 self._downloader.report_warning(msg)
1598         return res
1599
1600     def _set_cookie(self, domain, name, value, expire_time=None):
1601         cookie = compat_cookiejar.Cookie(
1602             0, name, value, None, None, domain, None,
1603             None, '/', True, False, expire_time, '', None, None, None)
1604         self._downloader.cookiejar.set_cookie(cookie)
1605
1606     def _get_cookies(self, url):
1607         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1608         req = sanitized_Request(url)
1609         self._downloader.cookiejar.add_cookie_header(req)
1610         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1611
1612     def get_testcases(self, include_onlymatching=False):
1613         t = getattr(self, '_TEST', None)
1614         if t:
1615             assert not hasattr(self, '_TESTS'), \
1616                 '%s has _TEST and _TESTS' % type(self).__name__
1617             tests = [t]
1618         else:
1619             tests = getattr(self, '_TESTS', [])
1620         for t in tests:
1621             if not include_onlymatching and t.get('only_matching', False):
1622                 continue
1623             t['name'] = type(self).__name__[:-len('IE')]
1624             yield t
1625
1626     def is_suitable(self, age_limit):
1627         """ Test whether the extractor is generally suitable for the given
1628         age limit (i.e. pornographic sites are not, all others usually are) """
1629
1630         any_restricted = False
1631         for tc in self.get_testcases(include_onlymatching=False):
1632             if 'playlist' in tc:
1633                 tc = tc['playlist'][0]
1634             is_restricted = age_restricted(
1635                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1636             if not is_restricted:
1637                 return True
1638             any_restricted = any_restricted or is_restricted
1639         return not any_restricted
1640
1641     def extract_subtitles(self, *args, **kwargs):
1642         if (self._downloader.params.get('writesubtitles', False) or
1643                 self._downloader.params.get('listsubtitles')):
1644             return self._get_subtitles(*args, **kwargs)
1645         return {}
1646
1647     def _get_subtitles(self, *args, **kwargs):
1648         raise NotImplementedError('This method must be implemented by subclasses')
1649
1650     @staticmethod
1651     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1652         """ Merge subtitle items for one language. Items with duplicated URLs
1653         will be dropped. """
1654         list1_urls = set([item['url'] for item in subtitle_list1])
1655         ret = list(subtitle_list1)
1656         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1657         return ret
1658
1659     @classmethod
1660     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1661         """ Merge two subtitle dictionaries, language by language. """
1662         ret = dict(subtitle_dict1)
1663         for lang in subtitle_dict2:
1664             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1665         return ret
1666
1667     def extract_automatic_captions(self, *args, **kwargs):
1668         if (self._downloader.params.get('writeautomaticsub', False) or
1669                 self._downloader.params.get('listsubtitles')):
1670             return self._get_automatic_captions(*args, **kwargs)
1671         return {}
1672
1673     def _get_automatic_captions(self, *args, **kwargs):
1674         raise NotImplementedError('This method must be implemented by subclasses')
1675
1676     def mark_watched(self, *args, **kwargs):
1677         if (self._downloader.params.get('mark_watched', False) and
1678                 (self._get_login_info()[0] is not None or
1679                     self._downloader.params.get('cookiefile') is not None)):
1680             self._mark_watched(*args, **kwargs)
1681
1682     def _mark_watched(self, *args, **kwargs):
1683         raise NotImplementedError('This method must be implemented by subclasses')
1684
1685
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching search queries: the search key, an
        optional count or 'all' prefix, a colon and the query text."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS and
        a number requests that many, capped at _MAX_RESULTS with a warning.
        Raises ExtractorError when the query does not match.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix'), match.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY