_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import (
  33     get_base_url,
  34     remove_encrypted_media,
  35 )
  36 from ..utils import (
  37     NO_DEFAULT,
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     error_to_compat_str,
  46     ExtractorError,
  47     extract_attributes,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     GeoRestrictedError,
  51     GeoUtils,
  52     int_or_none,
  53     js_to_json,
  54     mimetype2ext,
  55     orderedSet,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     RegexNotFoundError,
  61     sanitized_Request,
  62     sanitize_filename,
  63     unescapeHTML,
  64     unified_strdate,
  65     unified_timestamp,
  66     update_Request,
  67     update_url_query,
  68     urljoin,
  69     url_basename,
  70     xpath_element,
  71     xpath_text,
  72     xpath_with_ns,
  73 )
  74
  75
  76 class InfoExtractor(object):
  77     """Information Extractor class.
  78
  79     Information extractors are the classes that, given a URL, extract
  80     information about the video (or videos) the URL refers to. This
  81     information includes the real video URL, the video title, author and
  82     others. The information is stored in a dictionary which is then
  83     passed to the YoutubeDL. The YoutubeDL processes this
  84     information possibly downloading the video to the file system, among
  85     other possible outcomes.
  86
  87     The type field determines the type of the result.
  88     By far the most common value (and the default if _type is missing) is
  89     "video", which indicates a single video.
  90
  91     For a video, the dictionaries must include the following fields:
  92
  93     id:             Video identifier.
  94     title:          Video title, unescaped.
  95
  96     Additionally, it must contain either a formats entry or a url one:
  97
  98     formats:        A list of dictionaries for each format available, ordered
  99                     from worst to best quality.
 100
 101                     Potential fields:
 102                     * url        Mandatory. The URL of the video file
 103                     * manifest_url
 104                                  The URL of the manifest file in case of
 105                                  fragmented media (DASH, hls, hds)
 106                     * ext        Will be calculated from URL if missing
 107                     * format     A human-readable description of the format
 108                                  ("mp4 container with h264/opus").
 109                                  Calculated from the format_id, width, height.
 110                                  and format_note fields if missing.
 111                     * format_id  A short description of the format
 112                                  ("mp4_h264_opus" or "19").
 113                                 Technically optional, but strongly recommended.
 114                     * format_note Additional info about the format
 115                                  ("3D" or "DASH video")
 116                     * width      Width of the video, if known
 117                     * height     Height of the video, if known
 118                     * resolution Textual description of width and height
 119                     * tbr        Average bitrate of audio and video in KBit/s
 120                     * abr        Average audio bitrate in KBit/s
 121                     * acodec     Name of the audio codec in use
 122                     * asr        Audio sampling rate in Hertz
 123                     * vbr        Average video bitrate in KBit/s
 124                     * fps        Frame rate
 125                     * vcodec     Name of the video codec in use
 126                     * container  Name of the container format
 127                     * filesize   The number of bytes, if known in advance
 128                     * filesize_approx  An estimate for the number of bytes
 129                     * player_url SWF Player URL (used for rtmpdump).
 130                     * protocol   The protocol that will be used for the actual
 131                                  download, lower-case.
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 133                                  "m3u8", "m3u8_native" or "http_dash_segments".
 134                     * fragment_base_url
 135                                  Base URL for fragments. Each fragment's path
 136                                  value (if present) will be relative to
 137                                  this URL.
 138                     * fragments  A list of fragments of a fragmented media.
 139                                  Each fragment entry must contain either an url
 140                                  or a path. If an url is present it should be
 141                                  considered by a client. Otherwise both path and
 142                                  fragment_base_url must be present. Here is
 143                                  the list of all potential fields:
 144                                  * "url" - fragment's URL
 145                                  * "path" - fragment's path relative to
 146                                             fragment_base_url
 147                                  * "duration" (optional, int or float)
 148                                  * "filesize" (optional, int)
 149                     * preference Order number of this format. If this field is
 150                                  present and not None, the formats get sorted
 151                                  by this field, regardless of all other values.
 152                                  -1 for default (order by other properties),
 153                                  -2 or smaller for less than default.
 154                                  < -1000 to hide the format (if there is
 155                                     another one which is strictly better)
 156                     * language   Language code, e.g. "de" or "en-US".
 157                     * language_preference  Is this in the language mentioned in
 158                                  the URL?
 159                                  10 if it's what the URL is about,
 160                                  -1 for default (don't know),
 161                                  -10 otherwise, other values reserved for now.
 162                     * quality    Order number of the video quality of this
 163                                  format, irrespective of the file format.
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * source_preference  Order number for this video source
 167                                   (quality takes higher priority)
 168                                  -1 for default (order by other properties),
 169                                  -2 or smaller for less than default.
 170                     * http_headers  A dictionary of additional HTTP headers
 171                                  to add to the request.
 172                     * stretched_ratio  If given and not 1, indicates that the
 173                                  video's pixels are not square.
 174                                  width : height ratio as float.
 175                     * no_resume  The server does not support resuming the
 176                                  (HTTP or RTMP) download. Boolean.
 177
 178     url:            Final video URL.
 179     ext:            Video filename extension.
 180     format:         The video format, defaults to ext (used for --get-format)
 181     player_url:     SWF Player URL (used for rtmpdump).
 182
 183     The following fields are optional:
 184
 185     alt_title:      A secondary title of the video.
 186     display_id      An alternative identifier for the video, not necessarily
 187                     unique, but available before title. Typically, id is
 188                     something like "4234987", title "Dancing naked mole rats",
 189                     and display_id "dancing-naked-mole-rats"
 190     thumbnails:     A list of dictionaries, with the following entries:
 191                         * "id" (optional, string) - Thumbnail format ID
 192                         * "url"
 193                         * "preference" (optional, int) - quality of the image
 194                         * "width" (optional, int)
 195                         * "height" (optional, int)
 196                         * "resolution" (optional, string "{width}x{height}",
 197                                         deprecated)
 198                         * "filesize" (optional, int)
 199     thumbnail:      Full URL to a video thumbnail image.
 200     description:    Full video description.
 201     uploader:       Full name of the video uploader.
 202     license:        License name the video is licensed under.
 203     creator:        The creator of the video.
 204     release_date:   The date (YYYYMMDD) when the video was released.
 205     timestamp:      UNIX timestamp of the moment the video became available.
 206     upload_date:    Video upload date (YYYYMMDD).
 207                     If not explicitly set, calculated from timestamp.
 208     uploader_id:    Nickname or id of the video uploader.
 209     uploader_url:   Full URL to a personal webpage of the video uploader.
 210     location:       Physical location where the video was filmed.
 211     subtitles:      The available subtitles as a dictionary in the format
 212                     {tag: subformats}. "tag" is usually a language code, and
 213                     "subformats" is a list sorted from lower to higher
 214                     preference, each element is a dictionary with the "ext"
 215                     entry and one of:
 216                         * "data": The subtitles file contents
 217                         * "url": A URL pointing to the subtitles file
 218                     "ext" will be calculated from URL if missing
 219     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 220                     automatically generated captions
 221     duration:       Length of the video in seconds, as an integer or float.
 222     view_count:     How many users have watched the video on the platform.
 223     like_count:     Number of positive ratings of the video
 224     dislike_count:  Number of negative ratings of the video
 225     repost_count:   Number of reposts of the video
 226     average_rating: Average rating given by users, the scale used depends on the webpage
 227     comment_count:  Number of comments on the video
 228     comments:       A list of comments, each with one or more of the following
 229                     properties (all but one of text or html optional):
 230                         * "author" - human-readable name of the comment author
 231                         * "author_id" - user ID of the comment author
 232                         * "id" - Comment ID
 233                         * "html" - Comment as HTML
 234                         * "text" - Plain text of the comment
 235                         * "timestamp" - UNIX timestamp of comment
 236                         * "parent" - ID of the comment this one is replying to.
 237                                      Set to "root" to indicate that this is a
 238                                      comment to the original video.
 239     age_limit:      Age restriction for the video, as an integer (years)
 240     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 241                     should allow to get the same result again. (It will be set
 242                     by YoutubeDL if it's missing)
 243     categories:     A list of categories that the video falls in, for example
 244                     ["Sports", "Berlin"]
 245     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 246     is_live:        True, False, or None (=unknown). Whether this video is a
 247                     live stream that goes on instead of a fixed-length video.
 248     start_time:     Time in seconds where the reproduction should start, as
 249                     specified in the URL.
 250     end_time:       Time in seconds where the reproduction should end, as
 251                     specified in the URL.
 252     chapters:       A list of dictionaries, with the following entries:
 253                         * "start_time" - The start time of the chapter in seconds
 254                         * "end_time" - The end time of the chapter in seconds
 255                         * "title" (optional, string)
 256
 257     The following fields should only be used when the video belongs to some logical
 258     chapter or section:
 259
 260     chapter:        Name or title of the chapter the video belongs to.
 261     chapter_number: Number of the chapter the video belongs to, as an integer.
 262     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 263
 264     The following fields should only be used when the video is an episode of some
 265     series, programme or podcast:
 266
 267     series:         Title of the series or programme the video episode belongs to.
 268     season:         Title of the season the video episode belongs to.
 269     season_number:  Number of the season the video episode belongs to, as an integer.
 270     season_id:      Id of the season the video episode belongs to, as a unicode string.
 271     episode:        Title of the video episode. Unlike mandatory video title field,
 272                     this field should denote the exact title of the video episode
 273                     without any kind of decoration.
 274     episode_number: Number of the video episode within a season, as an integer.
 275     episode_id:     Id of the video episode, as a unicode string.
 276
 277     The following fields should only be used when the media is a track or a part of
 278     a music album:
 279
 280     track:          Title of the track.
 281     track_number:   Number of the track within an album or a disc, as an integer.
 282     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 283                     as a unicode string.
 284     artist:         Artist(s) of the track.
 285     genre:          Genre(s) of the track.
 286     album:          Title of the album the track belongs to.
 287     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 288     album_artist:   List of all artists appeared on the album (e.g.
 289                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 290                     and compilations).
 291     disc_number:    Number of the disc or other physical medium the track belongs to,
 292                     as an integer.
 293     release_year:   Year (YYYY) when the album was released.
 294
 295     Unless mentioned otherwise, the fields should be Unicode strings.
 296
 297     Unless mentioned otherwise, None is equivalent to absence of information.
 298
 299
 300     _type "playlist" indicates multiple videos.
 301     There must be a key "entries", which is a list, an iterable, or a PagedList
 302     object, each element of which is a valid dictionary by this specification.
 303
 304     Additionally, playlists can have "title", "description" and "id" attributes
 305     with the same semantics as videos (see above).
 306
 307
 308     _type "multi_video" indicates that there are multiple videos that
 309     form a single show, for example multiple acts of an opera or TV episode.
 310     It must have an entries key like a playlist and contain all the keys
 311     required for a video at the same time.
 312
 313
 314     _type "url" indicates that the video must be extracted from another
 315     location, possibly by a different extractor. Its only required key is:
 316     "url" - the next URL to extract.
 317     The key "ie_key" can be set to the class name (minus the trailing "IE",
 318     e.g. "Youtube") if the extractor class is known in advance.
 319     Additionally, the dictionary may have any properties of the resolved entity
 320     known in advance, for example "title" if the title of the referred video is
 321     known ahead of time.
 322
 323
 324     _type "url_transparent" entities have the same specification as "url", but
 325     indicate that the given additional information is more precise than the one
 326     associated with the resolved URL.
 327     This is useful when a site employs a video service that hosts the video and
 328     its technical metadata, but that video service does not embed a useful
 329     title, description etc.
 330
 331
 332     Subclasses of this one should re-define the _real_initialize() and
 333     _real_extract() methods and define a _VALID_URL regexp.
 334     Probably, they should also be added to the list of extractors.
 335
 336     _GEO_BYPASS attribute may be set to False in order to disable
 337     geo restriction bypass mechanisms for a particular extractor.
 338     Though it won't disable explicit geo restriction bypass based on
 339     country code provided with geo_bypass_country. (experimental)
 340
 341     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 342     countries for this extractor. One of these countries will be used by
 343     geo restriction bypass mechanism right away in order to bypass
 344     geo restriction, of course, if the mechanism is not disabled. (experimental)
 345
 346     NB: both these geo attributes are experimental and may change in future
 347     or be completely removed.
 348
 349     Finally, the _WORKING attribute should be set to False for broken IEs
 350     in order to warn the users and skip the tests.
 351     """
 352
    # True once _real_initialize() has been run (see initialize()).
    _ready = False
    # Downloader object assigned through set_downloader(); the methods of this
    # class read its params and call its urlopen()/to_screen()/report_warning().
    _downloader = None
    # Fake IP sent as the X-Forwarded-For header for geo bypass, or None.
    _x_forwarded_for_ip = None
    # Set to False to disable geo restriction bypass mechanisms for an extractor.
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries for an extractor.
    _GEO_COUNTRIES = None
    # Set to False for broken IEs in order to warn the users and skip the tests.
    _WORKING = True
 359
 360     def __init__(self, downloader=None):
 361         """Constructor. Receives an optional downloader."""
 362         self._ready = False
 363         self._x_forwarded_for_ip = None
 364         self.set_downloader(downloader)
 365
 366     @classmethod
 367     def suitable(cls, url):
 368         """Receives a URL and returns True if suitable for this IE."""
 369
 370         # This does not use has/getattr intentionally - we want to know whether
 371         # we have cached the regexp for *this* class, whereas getattr would also
 372         # match the superclass
 373         if '_VALID_URL_RE' not in cls.__dict__:
 374             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 375         return cls._VALID_URL_RE.match(url) is not None
 376
 377     @classmethod
 378     def _match_id(cls, url):
 379         if '_VALID_URL_RE' not in cls.__dict__:
 380             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 381         m = cls._VALID_URL_RE.match(url)
 382         assert m
 383         return compat_str(m.group('id'))
 384
 385     @classmethod
 386     def working(cls):
 387         """Getter method for _WORKING."""
 388         return cls._WORKING
 389
 390     def initialize(self):
 391         """Initializes an instance (authentication, etc)."""
 392         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 393         if not self._ready:
 394             self._real_initialize()
 395             self._ready = True
 396
 397     def _initialize_geo_bypass(self, countries):
 398         """
 399         Initialize geo restriction bypass mechanism.
 400
 401         This method is used to initialize geo bypass mechanism based on faking
 402         X-Forwarded-For HTTP header. A random country from provided country list
 403         is selected and a random IP belonging to this country is generated. This
 404         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 405         HTTP requests.
 406
 407         This method will be used for initial geo bypass mechanism initialization
 408         during the instance initialization with _GEO_COUNTRIES.
 409
 410         You may also manually call it from extractor's code if geo countries
 411         information is not available beforehand (e.g. obtained during
 412         extraction) or due to some another reason.
 413         """
 414         if not self._x_forwarded_for_ip:
 415             country_code = self._downloader.params.get('geo_bypass_country', None)
 416             # If there is no explicit country for geo bypass specified and
 417             # the extractor is known to be geo restricted let's fake IP
 418             # as X-Forwarded-For right away.
 419             if (not country_code and
 420                     self._GEO_BYPASS and
 421                     self._downloader.params.get('geo_bypass', True) and
 422                     countries):
 423                 country_code = random.choice(countries)
 424             if country_code:
 425                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 426                 if self._downloader.params.get('verbose', False):
 427                     self._downloader.to_screen(
 428                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 429                         % (self._x_forwarded_for_ip, country_code.upper()))
 430
 431     def extract(self, url):
 432         """Extracts URL information and returns it in list of dicts."""
 433         try:
 434             for _ in range(2):
 435                 try:
 436                     self.initialize()
 437                     ie_result = self._real_extract(url)
 438                     if self._x_forwarded_for_ip:
 439                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 440                     return ie_result
 441                 except GeoRestrictedError as e:
 442                     if self.__maybe_fake_ip_and_retry(e.countries):
 443                         continue
 444                     raise
 445         except ExtractorError:
 446             raise
 447         except compat_http_client.IncompleteRead as e:
 448             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 449         except (KeyError, StopIteration) as e:
 450             raise ExtractorError('An extractor error has occurred.', cause=e)
 451
 452     def __maybe_fake_ip_and_retry(self, countries):
 453         if (not self._downloader.params.get('geo_bypass_country', None) and
 454                 self._GEO_BYPASS and
 455                 self._downloader.params.get('geo_bypass', True) and
 456                 not self._x_forwarded_for_ip and
 457                 countries):
 458             country_code = random.choice(countries)
 459             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 460             if self._x_forwarded_for_ip:
 461                 self.report_warning(
 462                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 463                     % (self._x_forwarded_for_ip, country_code.upper()))
 464                 return True
 465         return False
 466
 467     def set_downloader(self, downloader):
 468         """Sets the downloader for this IE."""
 469         self._downloader = downloader
 470
 471     def _real_initialize(self):
 472         """Real initialization process. Redefine in subclasses."""
 473         pass
 474
 475     def _real_extract(self, url):
 476         """Real extraction process. Redefine in subclasses."""
 477         pass
 478
 479     @classmethod
 480     def ie_key(cls):
 481         """A string for getting the InfoExtractor with get_info_extractor"""
 482         return compat_str(cls.__name__[:-2])
 483
 484     @property
 485     def IE_NAME(self):
 486         return compat_str(type(self).__name__[:-2])
 487
 488     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 489         """ Returns the response handle """
 490         if note is None:
 491             self.report_download_webpage(video_id)
 492         elif note is not False:
 493             if video_id is None:
 494                 self.to_screen('%s' % (note,))
 495             else:
 496                 self.to_screen('%s: %s' % (video_id, note))
 497         if isinstance(url_or_request, compat_urllib_request.Request):
 498             url_or_request = update_Request(
 499                 url_or_request, data=data, headers=headers, query=query)
 500         else:
 501             if query:
 502                 url_or_request = update_url_query(url_or_request, query)
 503             if data is not None or headers:
 504                 url_or_request = sanitized_Request(url_or_request, data, headers)
 505         try:
 506             return self._downloader.urlopen(url_or_request)
 507         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 508             if errnote is False:
 509                 return False
 510             if errnote is None:
 511                 errnote = 'Unable to download webpage'
 512
 513             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 514             if fatal:
 515                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 516             else:
 517                 self._downloader.report_warning(errmsg)
 518                 return False
 519
 520     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 521         """ Returns a tuple (page content as string, URL handle) """
 522         # Strip hashes from the URL (#1038)
 523         if isinstance(url_or_request, (compat_str, str)):
 524             url_or_request = url_or_request.partition('#')[0]
 525
 526         # Some sites check X-Forwarded-For HTTP header in order to figure out
 527         # the origin of the client behind proxy. This allows bypassing geo
 528         # restriction by faking this header's value to IP that belongs to some
 529         # geo unrestricted country. We will do so once we encounter any
 530         # geo restriction error.
 531         if self._x_forwarded_for_ip:
 532             if 'X-Forwarded-For' not in headers:
 533                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 534
 535         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 536         if urlh is False:
 537             assert not fatal
 538             return False
 539         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 540         return (content, urlh)
 541
 542     @staticmethod
 543     def _guess_encoding_from_content(content_type, webpage_bytes):
 544         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 545         if m:
 546             encoding = m.group(1)
 547         else:
 548             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 549                           webpage_bytes[:1024])
 550             if m:
 551                 encoding = m.group(1).decode('ascii')
 552             elif webpage_bytes.startswith(b'\xff\xfe'):
 553                 encoding = 'utf-16'
 554             else:
 555                 encoding = 'utf-8'
 556
 557         return encoding
 558
 559     def __check_blocked(self, content):
 560         first_block = content[:512]
 561         if ('<title>Access to this site is blocked</title>' in content and
 562                 'Websense' in first_block):
 563             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 564             blocked_iframe = self._html_search_regex(
 565                 r'<iframe src="([^"]+)"', content,
 566                 'Websense information URL', default=None)
 567             if blocked_iframe:
 568                 msg += ' Visit %s for more details' % blocked_iframe
 569             raise ExtractorError(msg, expected=True)
 570         if '<title>The URL you requested has been blocked</title>' in first_block:
 571             msg = (
 572                 'Access to this webpage has been blocked by Indian censorship. '
 573                 'Use a VPN or proxy server (with --proxy) to route around it.')
 574             block_msg = self._html_search_regex(
 575                 r'</h1><p>(.*?)</p>',
 576                 content, 'block message', default=None)
 577             if block_msg:
 578                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 579             raise ExtractorError(msg, expected=True)
 580         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 581                 'blocklist.rkn.gov.ru' in content):
 582             raise ExtractorError(
 583                 'Access to this webpage has been blocked by decision of the Russian government. '
 584                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 585                 expected=True)
 586
 587     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 588         content_type = urlh.headers.get('Content-Type', '')
 589         webpage_bytes = urlh.read()
 590         if prefix is not None:
 591             webpage_bytes = prefix + webpage_bytes
 592         if not encoding:
 593             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 594         if self._downloader.params.get('dump_intermediate_pages', False):
 595             try:
 596                 url = url_or_request.get_full_url()
 597             except AttributeError:
 598                 url = url_or_request
 599             self.to_screen('Dumping request to ' + url)
 600             dump = base64.b64encode(webpage_bytes).decode('ascii')
 601             self._downloader.to_screen(dump)
 602         if self._downloader.params.get('write_pages', False):
 603             try:
 604                 url = url_or_request.get_full_url()
 605             except AttributeError:
 606                 url = url_or_request
 607             basen = '%s_%s' % (video_id, url)
 608             if len(basen) > 240:
 609                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 610                 basen = basen[:240 - len(h)] + h
 611             raw_filename = basen + '.dump'
 612             filename = sanitize_filename(raw_filename, restricted=True)
 613             self.to_screen('Saving request to ' + filename)
 614             # Working around MAX_PATH limitation on Windows (see
 615             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 616             if compat_os_name == 'nt':
 617                 absfilepath = os.path.abspath(filename)
 618                 if len(absfilepath) > 259:
 619                     filename = '\\\\?\\' + absfilepath
 620             with open(filename, 'wb') as outf:
 621                 outf.write(webpage_bytes)
 622
 623         try:
 624             content = webpage_bytes.decode(encoding, 'replace')
 625         except LookupError:
 626             content = webpage_bytes.decode('utf-8', 'replace')
 627
 628         self.__check_blocked(content)
 629
 630         return content
 631
 632     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 633         """ Returns the data of the page as a string """
 634         success = False
 635         try_count = 0
 636         while success is False:
 637             try:
 638                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 639                 success = True
 640             except compat_http_client.IncompleteRead as e:
 641                 try_count += 1
 642                 if try_count >= tries:
 643                     raise e
 644                 self._sleep(timeout, video_id)
 645         if res is False:
 646             return res
 647         else:
 648             content, _ = res
 649             return content
 650
 651     def _download_xml(self, url_or_request, video_id,
 652                       note='Downloading XML', errnote='Unable to download XML',
 653                       transform_source=None, fatal=True, encoding=None,
 654                       data=None, headers={}, query={}):
 655         """Return the xml as an xml.etree.ElementTree.Element"""
 656         xml_string = self._download_webpage(
 657             url_or_request, video_id, note, errnote, fatal=fatal,
 658             encoding=encoding, data=data, headers=headers, query=query)
 659         if xml_string is False:
 660             return xml_string
 661         return self._parse_xml(
 662             xml_string, video_id, transform_source=transform_source,
 663             fatal=fatal)
 664
 665     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 666         if transform_source:
 667             xml_string = transform_source(xml_string)
 668         try:
 669             return compat_etree_fromstring(xml_string.encode('utf-8'))
 670         except compat_xml_parse_error as ve:
 671             errmsg = '%s: Failed to parse XML ' % video_id
 672             if fatal:
 673                 raise ExtractorError(errmsg, cause=ve)
 674             else:
 675                 self.report_warning(errmsg + str(ve))
 676
 677     def _download_json(self, url_or_request, video_id,
 678                        note='Downloading JSON metadata',
 679                        errnote='Unable to download JSON metadata',
 680                        transform_source=None,
 681                        fatal=True, encoding=None, data=None, headers={}, query={}):
 682         json_string = self._download_webpage(
 683             url_or_request, video_id, note, errnote, fatal=fatal,
 684             encoding=encoding, data=data, headers=headers, query=query)
 685         if (not fatal) and json_string is False:
 686             return None
 687         return self._parse_json(
 688             json_string, video_id, transform_source=transform_source, fatal=fatal)
 689
 690     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 691         if transform_source:
 692             json_string = transform_source(json_string)
 693         try:
 694             return json.loads(json_string)
 695         except ValueError as ve:
 696             errmsg = '%s: Failed to parse JSON ' % video_id
 697             if fatal:
 698                 raise ExtractorError(errmsg, cause=ve)
 699             else:
 700                 self.report_warning(errmsg + str(ve))
 701
 702     def report_warning(self, msg, video_id=None):
 703         idstr = '' if video_id is None else '%s: ' % video_id
 704         self._downloader.report_warning(
 705             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 706
 707     def to_screen(self, msg):
 708         """Print msg to screen, prefixing it with '[ie_name]'"""
 709         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 710
 711     def report_extraction(self, id_or_name):
 712         """Report information extraction."""
 713         self.to_screen('%s: Extracting information' % id_or_name)
 714
 715     def report_download_webpage(self, video_id):
 716         """Report webpage download."""
 717         self.to_screen('%s: Downloading webpage' % video_id)
 718
 719     def report_age_confirmation(self):
 720         """Report attempt to confirm age."""
 721         self.to_screen('Confirming age')
 722
 723     def report_login(self):
 724         """Report attempt to log in."""
 725         self.to_screen('Logging in')
 726
 727     @staticmethod
 728     def raise_login_required(msg='This video is only available for registered users'):
 729         raise ExtractorError(
 730             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 731             expected=True)
 732
 733     @staticmethod
 734     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 735         raise GeoRestrictedError(msg, countries=countries)
 736
 737     # Methods for following #608
 738     @staticmethod
 739     def url_result(url, ie=None, video_id=None, video_title=None):
 740         """Returns a URL that points to a page that should be processed"""
 741         # TODO: ie should be the class used for getting the info
 742         video_info = {'_type': 'url',
 743                       'url': url,
 744                       'ie_key': ie}
 745         if video_id is not None:
 746             video_info['id'] = video_id
 747         if video_title is not None:
 748             video_info['title'] = video_title
 749         return video_info
 750
 751     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 752         urls = orderedSet(
 753             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 754             for m in matches)
 755         return self.playlist_result(
 756             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 757
 758     @staticmethod
 759     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 760         """Returns a playlist"""
 761         video_info = {'_type': 'playlist',
 762                       'entries': entries}
 763         if playlist_id:
 764             video_info['id'] = playlist_id
 765         if playlist_title:
 766             video_info['title'] = playlist_title
 767         if playlist_description:
 768             video_info['description'] = playlist_description
 769         return video_info
 770
 771     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 772         """
 773         Perform a regex search on the given string, using a single or a list of
 774         patterns returning the first matching group.
 775         In case of failure return a default value or raise a WARNING or a
 776         RegexNotFoundError, depending on fatal, specifying the field name.
 777         """
 778         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 779             mobj = re.search(pattern, string, flags)
 780         else:
 781             for p in pattern:
 782                 mobj = re.search(p, string, flags)
 783                 if mobj:
 784                     break
 785
 786         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 787             _name = '\033[0;34m%s\033[0m' % name
 788         else:
 789             _name = name
 790
 791         if mobj:
 792             if group is None:
 793                 # return the first matching group
 794                 return next(g for g in mobj.groups() if g is not None)
 795             else:
 796                 return mobj.group(group)
 797         elif default is not NO_DEFAULT:
 798             return default
 799         elif fatal:
 800             raise RegexNotFoundError('Unable to extract %s' % _name)
 801         else:
 802             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 803             return None
 804
 805     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 806         """
 807         Like _search_regex, but strips HTML tags and unescapes entities.
 808         """
 809         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 810         if res:
 811             return clean_html(res).strip()
 812         else:
 813             return res
 814
 815     def _get_netrc_login_info(self, netrc_machine=None):
 816         username = None
 817         password = None
 818         netrc_machine = netrc_machine or self._NETRC_MACHINE
 819
 820         if self._downloader.params.get('usenetrc', False):
 821             try:
 822                 info = netrc.netrc().authenticators(netrc_machine)
 823                 if info is not None:
 824                     username = info[0]
 825                     password = info[2]
 826                 else:
 827                     raise netrc.NetrcParseError(
 828                         'No authenticators for %s' % netrc_machine)
 829             except (IOError, netrc.NetrcParseError) as err:
 830                 self._downloader.report_warning(
 831                     'parsing .netrc: %s' % error_to_compat_str(err))
 832
 833         return username, password
 834
 835     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 836         """
 837         Get the login info as (username, password)
 838         First look for the manually specified credentials using username_option
 839         and password_option as keys in params dictionary. If no such credentials
 840         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 841         value.
 842         If there's no info available, return (None, None)
 843         """
 844         if self._downloader is None:
 845             return (None, None)
 846
 847         downloader_params = self._downloader.params
 848
 849         # Attempt to use provided username and password or .netrc data
 850         if downloader_params.get(username_option) is not None:
 851             username = downloader_params[username_option]
 852             password = downloader_params[password_option]
 853         else:
 854             username, password = self._get_netrc_login_info(netrc_machine)
 855
 856         return username, password
 857
 858     def _get_tfa_info(self, note='two-factor verification code'):
 859         """
 860         Get the two-factor authentication info
 861         TODO - asking the user will be required for sms/phone verify
 862         currently just uses the command line option
 863         If there's no info available, return None
 864         """
 865         if self._downloader is None:
 866             return None
 867         downloader_params = self._downloader.params
 868
 869         if downloader_params.get('twofactor') is not None:
 870             return downloader_params['twofactor']
 871
 872         return compat_getpass('Type %s and press [Return]: ' % note)
 873
 874     # Helper functions for extracting OpenGraph info
 875     @staticmethod
 876     def _og_regexes(prop):
 877         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 878         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 879                        % {'prop': re.escape(prop)})
 880         template = r'<meta[^>]+?%s[^>]+?%s'
 881         return [
 882             template % (property_re, content_re),
 883             template % (content_re, property_re),
 884         ]
 885
 886     @staticmethod
 887     def _meta_regex(prop):
 888         return r'''(?isx)<meta
 889                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 890                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 891
 892     def _og_search_property(self, prop, html, name=None, **kargs):
 893         if not isinstance(prop, (list, tuple)):
 894             prop = [prop]
 895         if name is None:
 896             name = 'OpenGraph %s' % prop[0]
 897         og_regexes = []
 898         for p in prop:
 899             og_regexes.extend(self._og_regexes(p))
 900         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 901         if escaped is None:
 902             return None
 903         return unescapeHTML(escaped)
 904
 905     def _og_search_thumbnail(self, html, **kargs):
 906         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 907
 908     def _og_search_description(self, html, **kargs):
 909         return self._og_search_property('description', html, fatal=False, **kargs)
 910
 911     def _og_search_title(self, html, **kargs):
 912         return self._og_search_property('title', html, **kargs)
 913
 914     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 915         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 916         if secure:
 917             regexes = self._og_regexes('video:secure_url') + regexes
 918         return self._html_search_regex(regexes, html, name, **kargs)
 919
 920     def _og_search_url(self, html, **kargs):
 921         return self._og_search_property('url', html, **kargs)
 922
 923     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 924         if not isinstance(name, (list, tuple)):
 925             name = [name]
 926         if display_name is None:
 927             display_name = name[0]
 928         return self._html_search_regex(
 929             [self._meta_regex(n) for n in name],
 930             html, display_name, fatal=fatal, group='content', **kwargs)
 931
 932     def _dc_search_uploader(self, html):
 933         return self._html_search_meta('dc.creator', html, 'uploader')
 934
 935     def _rta_search(self, html):
 936         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 937         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 938                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 939                      html):
 940             return 18
 941         return 0
 942
 943     def _media_rating_search(self, html):
 944         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 945         rating = self._html_search_meta('rating', html)
 946
 947         if not rating:
 948             return None
 949
 950         RATING_TABLE = {
 951             'safe for kids': 0,
 952             'general': 8,
 953             '14 years': 14,
 954             'mature': 17,
 955             'restricted': 19,
 956         }
 957         return RATING_TABLE.get(rating.lower())
 958
 959     def _family_friendly_search(self, html):
 960         # See http://schema.org/VideoObject
 961         family_friendly = self._html_search_meta(
 962             'isFamilyFriendly', html, default=None)
 963
 964         if not family_friendly:
 965             return None
 966
 967         RATING_TABLE = {
 968             '1': 0,
 969             'true': 0,
 970             '0': 18,
 971             'false': 18,
 972         }
 973         return RATING_TABLE.get(family_friendly.lower())
 974
 975     def _twitter_search_player(self, html):
 976         return self._html_search_meta('twitter:player', html,
 977                                       'twitter card player')
 978
 979     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 980         json_ld = self._search_regex(
 981             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 982             html, 'JSON-LD', group='json_ld', **kwargs)
 983         default = kwargs.get('default', NO_DEFAULT)
 984         if not json_ld:
 985             return default if default is not NO_DEFAULT else {}
 986         # JSON-LD may be malformed and thus `fatal` should be respected.
 987         # At the same time `default` may be passed that assumes `fatal=False`
 988         # for _search_regex. Let's simulate the same behavior here as well.
 989         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 990         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 991
 992     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 993         if isinstance(json_ld, compat_str):
 994             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 995         if not json_ld:
 996             return {}
 997         info = {}
 998         if not isinstance(json_ld, (list, tuple, dict)):
 999             return info
1000         if isinstance(json_ld, dict):
1001             json_ld = [json_ld]
1002
1003         def extract_video_object(e):
1004             assert e['@type'] == 'VideoObject'
1005             info.update({
1006                 'url': e.get('contentUrl'),
1007                 'title': unescapeHTML(e.get('name')),
1008                 'description': unescapeHTML(e.get('description')),
1009                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1010                 'duration': parse_duration(e.get('duration')),
1011                 'timestamp': unified_timestamp(e.get('uploadDate')),
1012                 'filesize': float_or_none(e.get('contentSize')),
1013                 'tbr': int_or_none(e.get('bitrate')),
1014                 'width': int_or_none(e.get('width')),
1015                 'height': int_or_none(e.get('height')),
1016                 'view_count': int_or_none(e.get('interactionCount')),
1017             })
1018
1019         for e in json_ld:
1020             if e.get('@context') == 'http://schema.org':
1021                 item_type = e.get('@type')
1022                 if expected_type is not None and expected_type != item_type:
1023                     return info
1024                 if item_type in ('TVEpisode', 'Episode'):
1025                     info.update({
1026                         'episode': unescapeHTML(e.get('name')),
1027                         'episode_number': int_or_none(e.get('episodeNumber')),
1028                         'description': unescapeHTML(e.get('description')),
1029                     })
1030                     part_of_season = e.get('partOfSeason')
1031                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1032                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1033                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1034                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1035                         info['series'] = unescapeHTML(part_of_series.get('name'))
1036                 elif item_type == 'Article':
1037                     info.update({
1038                         'timestamp': parse_iso8601(e.get('datePublished')),
1039                         'title': unescapeHTML(e.get('headline')),
1040                         'description': unescapeHTML(e.get('articleBody')),
1041                     })
1042                 elif item_type == 'VideoObject':
1043                     extract_video_object(e)
1044                     continue
1045                 video = e.get('video')
1046                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1047                     extract_video_object(video)
1048                 break
1049         return dict((k, v) for k, v in info.items() if v is not None)
1050
1051     @staticmethod
1052     def _hidden_inputs(html):
1053         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1054         hidden_inputs = {}
1055         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1056             attrs = extract_attributes(input)
1057             if not input:
1058                 continue
1059             if attrs.get('type') not in ('hidden', 'submit'):
1060                 continue
1061             name = attrs.get('name') or attrs.get('id')
1062             value = attrs.get('value')
1063             if name and value is not None:
1064                 hidden_inputs[name] = value
1065         return hidden_inputs
1066
1067     def _form_hidden_inputs(self, form_id, html):
1068         form = self._search_regex(
1069             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1070             html, '%s form' % form_id, group='form')
1071         return self._hidden_inputs(form)
1072
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort the list of format dicts `formats` in place.

        The list is sorted in ascending order of a key tuple computed per
        format, so formats with larger keys end up at the end of the list.
        When field_preference is a list or tuple of field names, the key is
        made of exactly those fields, in that order. Otherwise a built-in
        key is used (see the tuple returned by _formats_key).

        As side effects, a missing 'tbr' is filled in from 'abr' + 'vbr' and
        a missing 'ext' is derived from 'url'.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering: missing values sort first ('' for
            # format_id, -1 for every other field)
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # http(s) ranks above every other protocol, rtsp ranks lowest
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            # A later position in ORDER yields a larger key component;
            # extensions not listed get -1
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare element by element, so earlier entries take
            # precedence; missing numeric fields count as -1
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1148
1149     def _check_formats(self, formats, video_id):
1150         if formats:
1151             formats[:] = filter(
1152                 lambda f: self._is_valid_url(
1153                     f['url'], video_id,
1154                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1155                 formats)
1156
1157     @staticmethod
1158     def _remove_duplicate_formats(formats):
1159         format_urls = set()
1160         unique_formats = []
1161         for f in formats:
1162             if f['url'] not in format_urls:
1163                 format_urls.add(f['url'])
1164                 unique_formats.append(f)
1165         formats[:] = unique_formats
1166
1167     def _is_valid_url(self, url, video_id, item='video', headers={}):
1168         url = self._proto_relative_url(url, scheme='http:')
1169         # For now assume non HTTP(S) URLs always valid
1170         if not (url.startswith('http://') or url.startswith('https://')):
1171             return True
1172         try:
1173             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1174             return True
1175         except ExtractorError as e:
1176             if isinstance(e.cause, compat_urllib_error.URLError):
1177                 self.to_screen(
1178                     '%s: %s URL is invalid, skipping' % (video_id, item))
1179                 return False
1180             raise
1181
1182     def http_scheme(self):
1183         """ Either "http:" or "https:", depending on the user's preferences """
1184         return (
1185             'http:'
1186             if self._downloader.params.get('prefer_insecure', False)
1187             else 'https:')
1188
1189     def _proto_relative_url(self, url, scheme=None):
1190         if url is None:
1191             return url
1192         if url.startswith('//'):
1193             if scheme is None:
1194                 scheme = self.http_scheme()
1195             return scheme + url
1196         else:
1197             return url
1198
1199     def _sleep(self, timeout, video_id, msg_template=None):
1200         if msg_template is None:
1201             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1202         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1203         self.to_screen(msg)
1204         time.sleep(timeout)
1205
1206     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1207                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1208                              fatal=True, m3u8_id=None):
1209         manifest = self._download_xml(
1210             manifest_url, video_id, 'Downloading f4m manifest',
1211             'Unable to download f4m manifest',
1212             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1213             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1214             transform_source=transform_source,
1215             fatal=fatal)
1216
1217         if manifest is False:
1218             return []
1219
1220         return self._parse_f4m_formats(
1221             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1222             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1223
1224     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1225                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1226                            fatal=True, m3u8_id=None):
1227         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1228         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1229         if akamai_pv is not None and ';' in akamai_pv.text:
1230             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1231             if playerVerificationChallenge.strip() != '':
1232                 return []
1233
1234         formats = []
1235         manifest_version = '1.0'
1236         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1237         if not media_nodes:
1238             manifest_version = '2.0'
1239             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1240         # Remove unsupported DRM protected media from final formats
1241         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1242         media_nodes = remove_encrypted_media(media_nodes)
1243         if not media_nodes:
1244             return formats
1245
1246         manifest_base_url = get_base_url(manifest)
1247
1248         bootstrap_info = xpath_element(
1249             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1250             'bootstrap info', default=None)
1251
1252         vcodec = None
1253         mime_type = xpath_text(
1254             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1255             'base URL', default=None)
1256         if mime_type and mime_type.startswith('audio/'):
1257             vcodec = 'none'
1258
1259         for i, media_el in enumerate(media_nodes):
1260             tbr = int_or_none(media_el.attrib.get('bitrate'))
1261             width = int_or_none(media_el.attrib.get('width'))
1262             height = int_or_none(media_el.attrib.get('height'))
1263             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1264             # If <bootstrapInfo> is present, the specified f4m is a
1265             # stream-level manifest, and only set-level manifests may refer to
1266             # external resources.  See section 11.4 and section 4 of F4M spec
1267             if bootstrap_info is None:
1268                 media_url = None
1269                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1270                 if manifest_version == '2.0':
1271                     media_url = media_el.attrib.get('href')
1272                 if media_url is None:
1273                     media_url = media_el.attrib.get('url')
1274                 if not media_url:
1275                     continue
1276                 manifest_url = (
1277                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1278                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1279                 # If media_url is itself a f4m manifest do the recursive extraction
1280                 # since bitrates in parent manifest (this one) and media_url manifest
1281                 # may differ leading to inability to resolve the format by requested
1282                 # bitrate in f4m downloader
1283                 ext = determine_ext(manifest_url)
1284                 if ext == 'f4m':
1285                     f4m_formats = self._extract_f4m_formats(
1286                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1287                         transform_source=transform_source, fatal=fatal)
1288                     # Sometimes stream-level manifest contains single media entry that
1289                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1290                     # At the same time parent's media entry in set-level manifest may
1291                     # contain it. We will copy it from parent in such cases.
1292                     if len(f4m_formats) == 1:
1293                         f = f4m_formats[0]
1294                         f.update({
1295                             'tbr': f.get('tbr') or tbr,
1296                             'width': f.get('width') or width,
1297                             'height': f.get('height') or height,
1298                             'format_id': f.get('format_id') if not tbr else format_id,
1299                             'vcodec': vcodec,
1300                         })
1301                     formats.extend(f4m_formats)
1302                     continue
1303                 elif ext == 'm3u8':
1304                     formats.extend(self._extract_m3u8_formats(
1305                         manifest_url, video_id, 'mp4', preference=preference,
1306                         m3u8_id=m3u8_id, fatal=fatal))
1307                     continue
1308             formats.append({
1309                 'format_id': format_id,
1310                 'url': manifest_url,
1311                 'manifest_url': manifest_url,
1312                 'ext': 'flv' if bootstrap_info is not None else None,
1313                 'protocol': 'f4m',
1314                 'tbr': tbr,
1315                 'width': width,
1316                 'height': height,
1317                 'vcodec': vcodec,
1318                 'preference': preference,
1319             })
1320         return formats
1321
1322     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1323         return {
1324             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1325             'url': m3u8_url,
1326             'ext': ext,
1327             'protocol': 'm3u8',
1328             'preference': preference - 100 if preference else -100,
1329             'resolution': 'multiple',
1330             'format_note': 'Quality selection URL',
1331         }
1332
1333     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1334                               entry_protocol='m3u8', preference=None,
1335                               m3u8_id=None, note=None, errnote=None,
1336                               fatal=True, live=False):
1337         res = self._download_webpage_handle(
1338             m3u8_url, video_id,
1339             note=note or 'Downloading m3u8 information',
1340             errnote=errnote or 'Failed to download m3u8 information',
1341             fatal=fatal)
1342
1343         if res is False:
1344             return []
1345
1346         m3u8_doc, urlh = res
1347         m3u8_url = urlh.geturl()
1348
1349         return self._parse_m3u8_formats(
1350             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1351             preference=preference, m3u8_id=m3u8_id, live=live)
1352
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse formats from the text of an m3u8 playlist.

        m3u8_doc is the playlist text and m3u8_url the URL it belongs to;
        URIs that do not start with http(s):// are joined onto m3u8_url.
        ext, entry_protocol and preference are copied verbatim into every
        returned format dict.  m3u8_id, when set, is used as the leading
        component of the format ids.  When live is true, no stream name,
        bitrate or index component is appended to the format id of variant
        streams.

        Returns:
          * an empty list when the playlist carries the #EXT-X-FAXS-CM tag;
          * a single format pointing at m3u8_url itself when the playlist is
            a media playlist (it contains #EXT-X-TARGETDURATION);
          * otherwise one format per VIDEO/AUDIO #EXT-X-MEDIA rendition that
            has a URI plus one format per variant stream URI line.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        formats = []

        # Resolve a playlist URI: absolute http(s) URLs are kept as is,
        # anything else is joined onto the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on the availability of particular tags. As of
        # [1, 4.3.3, 4.3.4] master playlist tags MUST NOT appear in a media
        # playlist and vice versa.
        # As of [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in a master playlist, thus we can
        # clearly detect a media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag that has not yet
        # been consumed by a URI line
        last_stream_inf = {}

        # Register an EXT-X-MEDIA rendition in its group and, for VIDEO/AUDIO
        # renditions carrying a URI, add it to formats
        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        # Pick a name for the variant stream described by last_stream_inf;
        # returns None when neither NAME nor a VIDEO group reference is present
        def build_stream_name():
            # Although the specification does not mention a NAME attribute
            # for the EXT-X-STREAM-INF tag, it may still sometimes be present
            # (see [1] or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from the corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is treated as a variant stream URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains an EXT-X-STREAM-INF tag which references an AUDIO
                # rendition group but does not have CODECS and, despite
                # referencing an audio group, it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                # The tag attributes apply only to the URI line just handled
                last_stream_inf = {}
        return formats
1508
1509     @staticmethod
1510     def _xpath_ns(path, namespace=None):
1511         if not namespace:
1512             return path
1513         out = []
1514         for c in path.split('/'):
1515             if not c or c == '.':
1516                 out.append(c)
1517             else:
1518                 out.append('{%s}%s' % (namespace, c))
1519         return '/'.join(out)
1520
1521     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1522         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1523
1524         if smil is False:
1525             assert not fatal
1526             return []
1527
1528         namespace = self._parse_smil_namespace(smil)
1529
1530         return self._parse_smil_formats(
1531             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1532
1533     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1534         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1535         if smil is False:
1536             return {}
1537         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1538
1539     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1540         return self._download_xml(
1541             smil_url, video_id, 'Downloading SMIL file',
1542             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1543
1544     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1545         namespace = self._parse_smil_namespace(smil)
1546
1547         formats = self._parse_smil_formats(
1548             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1549         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1550
1551         video_id = os.path.splitext(url_basename(smil_url))[0]
1552         title = None
1553         description = None
1554         upload_date = None
1555         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1556             name = meta.attrib.get('name')
1557             content = meta.attrib.get('content')
1558             if not name or not content:
1559                 continue
1560             if not title and name == 'title':
1561                 title = content
1562             elif not description and name in ('description', 'abstract'):
1563                 description = content
1564             elif not upload_date and name == 'date':
1565                 upload_date = unified_strdate(content)
1566
1567         thumbnails = [{
1568             'id': image.get('type'),
1569             'url': image.get('src'),
1570             'width': int_or_none(image.get('width')),
1571             'height': int_or_none(image.get('height')),
1572         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1573
1574         return {
1575             'id': video_id,
1576             'title': title or video_id,
1577             'description': description,
1578             'upload_date': upload_date,
1579             'thumbnails': thumbnails,
1580             'formats': formats,
1581             'subtitles': subtitles,
1582         }
1583
1584     def _parse_smil_namespace(self, smil):
1585         return self._search_regex(
1586             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1587
1588     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1589         base = smil_url
1590         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1591             b = meta.get('base') or meta.get('httpBase')
1592             if b:
1593                 base = b
1594                 break
1595
1596         formats = []
1597         rtmp_count = 0
1598         http_count = 0
1599         m3u8_count = 0
1600
1601         srcs = []
1602         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1603         for medium in media:
1604             src = medium.get('src')
1605             if not src or src in srcs:
1606                 continue
1607             srcs.append(src)
1608
1609             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1610             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1611             width = int_or_none(medium.get('width'))
1612             height = int_or_none(medium.get('height'))
1613             proto = medium.get('proto')
1614             ext = medium.get('ext')
1615             src_ext = determine_ext(src)
1616             streamer = medium.get('streamer') or base
1617
1618             if proto == 'rtmp' or streamer.startswith('rtmp'):
1619                 rtmp_count += 1
1620                 formats.append({
1621                     'url': streamer,
1622                     'play_path': src,
1623                     'ext': 'flv',
1624                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1625                     'tbr': bitrate,
1626                     'filesize': filesize,
1627                     'width': width,
1628                     'height': height,
1629                 })
1630                 if transform_rtmp_url:
1631                     streamer, src = transform_rtmp_url(streamer, src)
1632                     formats[-1].update({
1633                         'url': streamer,
1634                         'play_path': src,
1635                     })
1636                 continue
1637
1638             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1639             src_url = src_url.strip()
1640
1641             if proto == 'm3u8' or src_ext == 'm3u8':
1642                 m3u8_formats = self._extract_m3u8_formats(
1643                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1644                 if len(m3u8_formats) == 1:
1645                     m3u8_count += 1
1646                     m3u8_formats[0].update({
1647                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1648                         'tbr': bitrate,
1649                         'width': width,
1650                         'height': height,
1651                     })
1652                 formats.extend(m3u8_formats)
1653                 continue
1654
1655             if src_ext == 'f4m':
1656                 f4m_url = src_url
1657                 if not f4m_params:
1658                     f4m_params = {
1659                         'hdcore': '3.2.0',
1660                         'plugin': 'flowplayer-3.2.0.1',
1661                     }
1662                 f4m_url += '&' if '?' in f4m_url else '?'
1663                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1664                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1665                 continue
1666
1667             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1668                 http_count += 1
1669                 formats.append({
1670                     'url': src_url,
1671                     'ext': ext or src_ext or 'flv',
1672                     'format_id': 'http-%d' % (bitrate or http_count),
1673                     'tbr': bitrate,
1674                     'filesize': filesize,
1675                     'width': width,
1676                     'height': height,
1677                 })
1678                 continue
1679
1680         return formats
1681
1682     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1683         urls = []
1684         subtitles = {}
1685         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1686             src = textstream.get('src')
1687             if not src or src in urls:
1688                 continue
1689             urls.append(src)
1690             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1691             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1692             subtitles.setdefault(lang, []).append({
1693                 'url': src,
1694                 'ext': ext,
1695             })
1696         return subtitles
1697
1698     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1699         xspf = self._download_xml(
1700             playlist_url, playlist_id, 'Downloading xpsf playlist',
1701             'Unable to download xspf manifest', fatal=fatal)
1702         if xspf is False:
1703             return []
1704         return self._parse_xspf(xspf, playlist_id)
1705
1706     def _parse_xspf(self, playlist, playlist_id):
1707         NS_MAP = {
1708             'xspf': 'http://xspf.org/ns/0/',
1709             's1': 'http://static.streamone.nl/player/ns/0',
1710         }
1711
1712         entries = []
1713         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1714             title = xpath_text(
1715                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1716             description = xpath_text(
1717                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1718             thumbnail = xpath_text(
1719                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1720             duration = float_or_none(
1721                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1722
1723             formats = [{
1724                 'url': location.text,
1725                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1726                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1727                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1728             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1729             self._sort_formats(formats)
1730
1731             entries.append({
1732                 'id': playlist_id,
1733                 'title': title,
1734                 'description': description,
1735                 'thumbnail': thumbnail,
1736                 'duration': duration,
1737                 'formats': formats,
1738             })
1739         return entries
1740
1741     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1742         res = self._download_webpage_handle(
1743             mpd_url, video_id,
1744             note=note or 'Downloading MPD manifest',
1745             errnote=errnote or 'Failed to download MPD manifest',
1746             fatal=fatal)
1747         if res is False:
1748             return []
1749         mpd, urlh = res
1750         mpd_base_url = base_url(urlh.geturl())
1751
1752         return self._parse_mpd_formats(
1753             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1754             formats_dict=formats_dict, mpd_url=mpd_url)
1755
1756     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1757         """
1758         Parse formats from MPD manifest.
1759         References:
1760          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1761             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1762          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1763         """
1764         if mpd_doc.get('type') == 'dynamic':
1765             return []
1766
1767         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1768
1769         def _add_ns(path):
1770             return self._xpath_ns(path, namespace)
1771
1772         def is_drm_protected(element):
1773             return element.find(_add_ns('ContentProtection')) is not None
1774
1775         def extract_multisegment_info(element, ms_parent_info):
1776             ms_info = ms_parent_info.copy()
1777
1778             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1779             # common attributes and elements.  We will only extract relevant
1780             # for us.
1781             def extract_common(source):
1782                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1783                 if segment_timeline is not None:
1784                     s_e = segment_timeline.findall(_add_ns('S'))
1785                     if s_e:
1786                         ms_info['total_number'] = 0
1787                         ms_info['s'] = []
1788                         for s in s_e:
1789                             r = int(s.get('r', 0))
1790                             ms_info['total_number'] += 1 + r
1791                             ms_info['s'].append({
1792                                 't': int(s.get('t', 0)),
1793                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1794                                 'd': int(s.attrib['d']),
1795                                 'r': r,
1796                             })
1797                 start_number = source.get('startNumber')
1798                 if start_number:
1799                     ms_info['start_number'] = int(start_number)
1800                 timescale = source.get('timescale')
1801                 if timescale:
1802                     ms_info['timescale'] = int(timescale)
1803                 segment_duration = source.get('duration')
1804                 if segment_duration:
1805                     ms_info['segment_duration'] = float(segment_duration)
1806
1807             def extract_Initialization(source):
1808                 initialization = source.find(_add_ns('Initialization'))
1809                 if initialization is not None:
1810                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1811
1812             segment_list = element.find(_add_ns('SegmentList'))
1813             if segment_list is not None:
1814                 extract_common(segment_list)
1815                 extract_Initialization(segment_list)
1816                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1817                 if segment_urls_e:
1818                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1819             else:
1820                 segment_template = element.find(_add_ns('SegmentTemplate'))
1821                 if segment_template is not None:
1822                     extract_common(segment_template)
1823                     media = segment_template.get('media')
1824                     if media:
1825                         ms_info['media'] = media
1826                     initialization = segment_template.get('initialization')
1827                     if initialization:
1828                         ms_info['initialization'] = initialization
1829                     else:
1830                         extract_Initialization(segment_template)
1831             return ms_info
1832
1833         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1834         formats = []
1835         for period in mpd_doc.findall(_add_ns('Period')):
1836             period_duration = parse_duration(period.get('duration')) or mpd_duration
1837             period_ms_info = extract_multisegment_info(period, {
1838                 'start_number': 1,
1839                 'timescale': 1,
1840             })
1841             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1842                 if is_drm_protected(adaptation_set):
1843                     continue
1844                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1845                 for representation in adaptation_set.findall(_add_ns('Representation')):
1846                     if is_drm_protected(representation):
1847                         continue
1848                     representation_attrib = adaptation_set.attrib.copy()
1849                     representation_attrib.update(representation.attrib)
1850                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1851                     mime_type = representation_attrib['mimeType']
1852                     content_type = mime_type.split('/')[0]
1853                     if content_type == 'text':
1854                         # TODO implement WebVTT downloading
1855                         pass
1856                     elif content_type in ('video', 'audio'):
1857                         base_url = ''
1858                         for element in (representation, adaptation_set, period, mpd_doc):
1859                             base_url_e = element.find(_add_ns('BaseURL'))
1860                             if base_url_e is not None:
1861                                 base_url = base_url_e.text + base_url
1862                                 if re.match(r'^https?://', base_url):
1863                                     break
1864                         if mpd_base_url and not re.match(r'^https?://', base_url):
1865                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1866                                 mpd_base_url += '/'
1867                             base_url = mpd_base_url + base_url
1868                         representation_id = representation_attrib.get('id')
1869                         lang = representation_attrib.get('lang')
1870                         url_el = representation.find(_add_ns('BaseURL'))
1871                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1872                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1873                         f = {
1874                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1875                             'url': base_url,
1876                             'manifest_url': mpd_url,
1877                             'ext': mimetype2ext(mime_type),
1878                             'width': int_or_none(representation_attrib.get('width')),
1879                             'height': int_or_none(representation_attrib.get('height')),
1880                             'tbr': float_or_none(bandwidth, 1000),
1881                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1882                             'fps': int_or_none(representation_attrib.get('frameRate')),
1883                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1884                             'format_note': 'DASH %s' % content_type,
1885                             'filesize': filesize,
1886                         }
1887                         f.update(parse_codecs(representation_attrib.get('codecs')))
1888                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1889
1890                         def prepare_template(template_name, identifiers):
1891                             t = representation_ms_info[template_name]
1892                             t = t.replace('$RepresentationID$', representation_id)
1893                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1894                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1895                             t.replace('$$', '$')
1896                             return t
1897
1898                         # @initialization is a regular template like @media one
1899                         # so it should be handled just the same way (see
1900                         # https://github.com/rg3/youtube-dl/issues/11605)
1901                         if 'initialization' in representation_ms_info:
1902                             initialization_template = prepare_template(
1903                                 'initialization',
1904                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1905                                 # $Time$ shall not be included for @initialization thus
1906                                 # only $Bandwidth$ remains
1907                                 ('Bandwidth', ))
1908                             representation_ms_info['initialization_url'] = initialization_template % {
1909                                 'Bandwidth': bandwidth,
1910                             }
1911
1912                         def location_key(location):
1913                             return 'url' if re.match(r'^https?://', location) else 'path'
1914
1915                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1916
1917                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1918                             media_location_key = location_key(media_template)
1919
1920                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1921                             # can't be used at the same time
1922                             if '%(Number' in media_template and 's' not in representation_ms_info:
1923                                 segment_duration = None
1924                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1925                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1926                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1927                                 representation_ms_info['fragments'] = [{
1928                                     media_location_key: media_template % {
1929                                         'Number': segment_number,
1930                                         'Bandwidth': bandwidth,
1931                                     },
1932                                     'duration': segment_duration,
1933                                 } for segment_number in range(
1934                                     representation_ms_info['start_number'],
1935                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1936                             else:
1937                                 # $Number*$ or $Time$ in media template with S list available
1938                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1939                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1940                                 representation_ms_info['fragments'] = []
1941                                 segment_time = 0
1942                                 segment_d = None
1943                                 segment_number = representation_ms_info['start_number']
1944
1945                                 def add_segment_url():
1946                                     segment_url = media_template % {
1947                                         'Time': segment_time,
1948                                         'Bandwidth': bandwidth,
1949                                         'Number': segment_number,
1950                                     }
1951                                     representation_ms_info['fragments'].append({
1952                                         media_location_key: segment_url,
1953                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1954                                     })
1955
1956                                 for num, s in enumerate(representation_ms_info['s']):
1957                                     segment_time = s.get('t') or segment_time
1958                                     segment_d = s['d']
1959                                     add_segment_url()
1960                                     segment_number += 1
1961                                     for r in range(s.get('r', 0)):
1962                                         segment_time += segment_d
1963                                         add_segment_url()
1964                                         segment_number += 1
1965                                     segment_time += segment_d
1966                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1967                             # No media template
1968                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1969                             # or any YouTube dashsegments video
1970                             fragments = []
1971                             segment_index = 0
1972                             timescale = representation_ms_info['timescale']
1973                             for s in representation_ms_info['s']:
1974                                 duration = float_or_none(s['d'], timescale)
1975                                 for r in range(s.get('r', 0) + 1):
1976                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1977                                     fragments.append({
1978                                         location_key(segment_uri): segment_uri,
1979                                         'duration': duration,
1980                                     })
1981                                     segment_index += 1
1982                             representation_ms_info['fragments'] = fragments
1983                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1984                         # No fragments key is present in this case.
1985                         if 'fragments' in representation_ms_info:
1986                             f.update({
1987                                 'fragment_base_url': base_url,
1988                                 'fragments': [],
1989                                 'protocol': 'http_dash_segments',
1990                             })
1991                             if 'initialization_url' in representation_ms_info:
1992                                 initialization_url = representation_ms_info['initialization_url']
1993                                 if not f.get('url'):
1994                                     f['url'] = initialization_url
1995                                 f['fragments'].append({location_key(initialization_url): initialization_url})
1996                             f['fragments'].extend(representation_ms_info['fragments'])
1997                         try:
1998                             existing_format = next(
1999                                 fo for fo in formats
2000                                 if fo['format_id'] == representation_id)
2001                         except StopIteration:
2002                             full_info = formats_dict.get(representation_id, {}).copy()
2003                             full_info.update(f)
2004                             formats.append(full_info)
2005                         else:
2006                             existing_format.update(f)
2007                     else:
2008                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2009         return formats
2010
2011     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2012         res = self._download_webpage_handle(
2013             ism_url, video_id,
2014             note=note or 'Downloading ISM manifest',
2015             errnote=errnote or 'Failed to download ISM manifest',
2016             fatal=fatal)
2017         if res is False:
2018             return []
2019         ism, urlh = res
2020
2021         return self._parse_ism_formats(
2022             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2023
2024     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2025         """
2026         Parse formats from ISM manifest.
2027         References:
2028          1. [MS-SSTR]: Smooth Streaming Protocol,
2029             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2030         """
2031         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2032             return []
2033
2034         duration = int(ism_doc.attrib['Duration'])
2035         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2036
2037         formats = []
2038         for stream in ism_doc.findall('StreamIndex'):
2039             stream_type = stream.get('Type')
2040             if stream_type not in ('video', 'audio'):
2041                 continue
2042             url_pattern = stream.attrib['Url']
2043             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2044             stream_name = stream.get('Name')
2045             for track in stream.findall('QualityLevel'):
2046                 fourcc = track.get('FourCC')
2047                 # TODO: add support for WVC1 and WMAP
2048                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2049                     self.report_warning('%s is not a supported codec' % fourcc)
2050                     continue
2051                 tbr = int(track.attrib['Bitrate']) // 1000
2052                 # [1] does not mention Width and Height attributes. However,
2053                 # they're often present while MaxWidth and MaxHeight are
2054                 # missing, so should be used as fallbacks
2055                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2056                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2057                 sampling_rate = int_or_none(track.get('SamplingRate'))
2058
2059                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2060                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2061
2062                 fragments = []
2063                 fragment_ctx = {
2064                     'time': 0,
2065                 }
2066                 stream_fragments = stream.findall('c')
2067                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2068                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2069                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2070                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2071                     if not fragment_ctx['duration']:
2072                         try:
2073                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2074                         except IndexError:
2075                             next_fragment_time = duration
2076                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2077                     for _ in range(fragment_repeat):
2078                         fragments.append({
2079                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2080                             'duration': fragment_ctx['duration'] / stream_timescale,
2081                         })
2082                         fragment_ctx['time'] += fragment_ctx['duration']
2083
2084                 format_id = []
2085                 if ism_id:
2086                     format_id.append(ism_id)
2087                 if stream_name:
2088                     format_id.append(stream_name)
2089                 format_id.append(compat_str(tbr))
2090
2091                 formats.append({
2092                     'format_id': '-'.join(format_id),
2093                     'url': ism_url,
2094                     'manifest_url': ism_url,
2095                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2096                     'width': width,
2097                     'height': height,
2098                     'tbr': tbr,
2099                     'asr': sampling_rate,
2100                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2101                     'acodec': 'none' if stream_type == 'video' else fourcc,
2102                     'protocol': 'ism',
2103                     'fragments': fragments,
2104                     '_download_params': {
2105                         'duration': duration,
2106                         'timescale': stream_timescale,
2107                         'width': width or 0,
2108                         'height': height or 0,
2109                         'fourcc': fourcc,
2110                         'codec_private_data': track.get('CodecPrivateData'),
2111                         'sampling_rate': sampling_rate,
2112                         'channels': int_or_none(track.get('Channels', 2)),
2113                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2114                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2115                     },
2116                 })
2117         return formats
2118
2119     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2120         def absolute_url(video_url):
2121             return compat_urlparse.urljoin(base_url, video_url)
2122
2123         def parse_content_type(content_type):
2124             if not content_type:
2125                 return {}
2126             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2127             if ctr:
2128                 mimetype, codecs = ctr.groups()
2129                 f = parse_codecs(codecs)
2130                 f['ext'] = mimetype2ext(mimetype)
2131                 return f
2132             return {}
2133
2134         def _media_formats(src, cur_media_type, type_info={}):
2135             full_url = absolute_url(src)
2136             ext = type_info.get('ext') or determine_ext(full_url)
2137             if ext == 'm3u8':
2138                 is_plain_url = False
2139                 formats = self._extract_m3u8_formats(
2140                     full_url, video_id, ext='mp4',
2141                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2142                     preference=preference, fatal=False)
2143             elif ext == 'mpd':
2144                 is_plain_url = False
2145                 formats = self._extract_mpd_formats(
2146                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2147             else:
2148                 is_plain_url = True
2149                 formats = [{
2150                     'url': full_url,
2151                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2152                 }]
2153             return is_plain_url, formats
2154
2155         entries = []
2156         # amp-video and amp-audio are very similar to their HTML5 counterparts
2157         # so we wll include them right here (see
2158         # https://www.ampproject.org/docs/reference/components/amp-video)
2159         media_tags = [(media_tag, media_type, '')
2160                       for media_tag, media_type
2161                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2162         media_tags.extend(re.findall(
2163             # We only allow video|audio followed by a whitespace or '>'.
2164             # Allowing more characters may end up in significant slow down (see
2165             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2166             # http://www.porntrex.com/maps/videositemap.xml).
2167             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2168         for media_tag, media_type, media_content in media_tags:
2169             media_info = {
2170                 'formats': [],
2171                 'subtitles': {},
2172             }
2173             media_attributes = extract_attributes(media_tag)
2174             src = media_attributes.get('src')
2175             if src:
2176                 _, formats = _media_formats(src, media_type)
2177                 media_info['formats'].extend(formats)
2178             media_info['thumbnail'] = media_attributes.get('poster')
2179             if media_content:
2180                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2181                     source_attributes = extract_attributes(source_tag)
2182                     src = source_attributes.get('src')
2183                     if not src:
2184                         continue
2185                     f = parse_content_type(source_attributes.get('type'))
2186                     is_plain_url, formats = _media_formats(src, media_type, f)
2187                     if is_plain_url:
2188                         # res attribute is not standard but seen several times
2189                         # in the wild
2190                         f.update({
2191                             'height': int_or_none(source_attributes.get('res')),
2192                             'format_id': source_attributes.get('label'),
2193                         })
2194                         f.update(formats[0])
2195                         media_info['formats'].append(f)
2196                     else:
2197                         media_info['formats'].extend(formats)
2198                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2199                     track_attributes = extract_attributes(track_tag)
2200                     kind = track_attributes.get('kind')
2201                     if not kind or kind in ('subtitles', 'captions'):
2202                         src = track_attributes.get('src')
2203                         if not src:
2204                             continue
2205                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2206                         media_info['subtitles'].setdefault(lang, []).append({
2207                             'url': absolute_url(src),
2208                         })
2209             if media_info['formats'] or media_info['subtitles']:
2210                 entries.append(media_info)
2211         return entries
2212
2213     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2214         formats = []
2215         hdcore_sign = 'hdcore=3.7.0'
2216         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2217         hds_host = hosts.get('hds')
2218         if hds_host:
2219             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2220         if 'hdcore=' not in f4m_url:
2221             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2222         f4m_formats = self._extract_f4m_formats(
2223             f4m_url, video_id, f4m_id='hds', fatal=False)
2224         for entry in f4m_formats:
2225             entry.update({'extra_param_to_segment_url': hdcore_sign})
2226         formats.extend(f4m_formats)
2227         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2228         hls_host = hosts.get('hls')
2229         if hls_host:
2230             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2231         formats.extend(self._extract_m3u8_formats(
2232             m3u8_url, video_id, 'mp4', 'm3u8_native',
2233             m3u8_id='hls', fatal=False))
2234         return formats
2235
2236     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2237         query = compat_urlparse.urlparse(url).query
2238         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2239         url_base = self._search_regex(
2240             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2241         http_base_url = '%s:%s' % ('http', url_base)
2242         formats = []
2243
2244         def manifest_url(manifest):
2245             m_url = '%s/%s' % (http_base_url, manifest)
2246             if query:
2247                 m_url += '?%s' % query
2248             return m_url
2249
2250         if 'm3u8' not in skip_protocols:
2251             formats.extend(self._extract_m3u8_formats(
2252                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2253                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2254         if 'f4m' not in skip_protocols:
2255             formats.extend(self._extract_f4m_formats(
2256                 manifest_url('manifest.f4m'),
2257                 video_id, f4m_id='hds', fatal=False))
2258         if 'dash' not in skip_protocols:
2259             formats.extend(self._extract_mpd_formats(
2260                 manifest_url('manifest.mpd'),
2261                 video_id, mpd_id='dash', fatal=False))
2262         if re.search(r'(?:/smil:|\.smil)', url_base):
2263             if 'smil' not in skip_protocols:
2264                 rtmp_formats = self._extract_smil_formats(
2265                     manifest_url('jwplayer.smil'),
2266                     video_id, fatal=False)
2267                 for rtmp_format in rtmp_formats:
2268                     rtsp_format = rtmp_format.copy()
2269                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2270                     del rtsp_format['play_path']
2271                     del rtsp_format['ext']
2272                     rtsp_format.update({
2273                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2274                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2275                         'protocol': 'rtsp',
2276                     })
2277                     formats.extend([rtmp_format, rtsp_format])
2278         else:
2279             for protocol in ('rtmp', 'rtsp'):
2280                 if protocol not in skip_protocols:
2281                     formats.append({
2282                         'url': '%s:%s' % (protocol, url_base),
2283                         'format_id': protocol,
2284                         'protocol': protocol,
2285                     })
2286         return formats
2287
2288     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2289         mobj = re.search(
2290             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2291             webpage)
2292         if mobj:
2293             try:
2294                 jwplayer_data = self._parse_json(mobj.group('options'),
2295                                                  video_id=video_id,
2296                                                  transform_source=transform_source)
2297             except ExtractorError:
2298                 pass
2299             else:
2300                 if isinstance(jwplayer_data, dict):
2301                     return jwplayer_data
2302
2303     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2304         jwplayer_data = self._find_jwplayer_data(
2305             webpage, video_id, transform_source=js_to_json)
2306         return self._parse_jwplayer_data(
2307             jwplayer_data, video_id, *args, **kwargs)
2308
2309     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2310                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2311         # JWPlayer backward compatibility: flattened playlists
2312         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2313         if 'playlist' not in jwplayer_data:
2314             jwplayer_data = {'playlist': [jwplayer_data]}
2315
2316         entries = []
2317
2318         # JWPlayer backward compatibility: single playlist item
2319         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2320         if not isinstance(jwplayer_data['playlist'], list):
2321             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2322
2323         for video_data in jwplayer_data['playlist']:
2324             # JWPlayer backward compatibility: flattened sources
2325             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2326             if 'sources' not in video_data:
2327                 video_data['sources'] = [video_data]
2328
2329             this_video_id = video_id or video_data['mediaid']
2330
2331             formats = self._parse_jwplayer_formats(
2332                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2333                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2334
2335             subtitles = {}
2336             tracks = video_data.get('tracks')
2337             if tracks and isinstance(tracks, list):
2338                 for track in tracks:
2339                     if not isinstance(track, dict):
2340                         continue
2341                     if track.get('kind') != 'captions':
2342                         continue
2343                     track_url = urljoin(base_url, track.get('file'))
2344                     if not track_url:
2345                         continue
2346                     subtitles.setdefault(track.get('label') or 'en', []).append({
2347                         'url': self._proto_relative_url(track_url)
2348                     })
2349
2350             entry = {
2351                 'id': this_video_id,
2352                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2353                 'description': video_data.get('description'),
2354                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2355                 'timestamp': int_or_none(video_data.get('pubdate')),
2356                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2357                 'subtitles': subtitles,
2358             }
2359             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2360             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2361                 entry.update({
2362                     '_type': 'url_transparent',
2363                     'url': formats[0]['url'],
2364                 })
2365             else:
2366                 self._sort_formats(formats)
2367                 entry['formats'] = formats
2368             entries.append(entry)
2369         if len(entries) == 1:
2370             return entries[0]
2371         else:
2372             return self.playlist_result(entries)
2373
2374     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2375                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2376         urls = []
2377         formats = []
2378         for source in jwplayer_sources_data:
2379             if not isinstance(source, dict):
2380                 continue
2381             source_url = self._proto_relative_url(source.get('file'))
2382             if not source_url:
2383                 continue
2384             if base_url:
2385                 source_url = compat_urlparse.urljoin(base_url, source_url)
2386             if source_url in urls:
2387                 continue
2388             urls.append(source_url)
2389             source_type = source.get('type') or ''
2390             ext = mimetype2ext(source_type) or determine_ext(source_url)
2391             if source_type == 'hls' or ext == 'm3u8':
2392                 formats.extend(self._extract_m3u8_formats(
2393                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2394                     m3u8_id=m3u8_id, fatal=False))
2395             elif ext == 'mpd':
2396                 formats.extend(self._extract_mpd_formats(
2397                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2398             elif ext == 'smil':
2399                 formats.extend(self._extract_smil_formats(
2400                     source_url, video_id, fatal=False))
2401             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2402             elif source_type.startswith('audio') or ext in (
2403                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2404                 formats.append({
2405                     'url': source_url,
2406                     'vcodec': 'none',
2407                     'ext': ext,
2408                 })
2409             else:
2410                 height = int_or_none(source.get('height'))
2411                 if height is None:
2412                     # Often no height is provided but there is a label in
2413                     # format like "1080p", "720p SD", or 1080.
2414                     height = int_or_none(self._search_regex(
2415                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2416                         'height', default=None))
2417                 a_format = {
2418                     'url': source_url,
2419                     'width': int_or_none(source.get('width')),
2420                     'height': height,
2421                     'tbr': int_or_none(source.get('bitrate')),
2422                     'ext': ext,
2423                 }
2424                 if source_url.startswith('rtmp'):
2425                     a_format['ext'] = 'flv'
2426                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2427                     # of jwplayer.flash.swf
2428                     rtmp_url_parts = re.split(
2429                         r'((?:mp4|mp3|flv):)', source_url, 1)
2430                     if len(rtmp_url_parts) == 3:
2431                         rtmp_url, prefix, play_path = rtmp_url_parts
2432                         a_format.update({
2433                             'url': rtmp_url,
2434                             'play_path': prefix + play_path,
2435                         })
2436                     if rtmp_params:
2437                         a_format.update(rtmp_params)
2438                 formats.append(a_format)
2439         return formats
2440
2441     def _live_title(self, name):
2442         """ Generate the title for a live video """
2443         now = datetime.datetime.now()
2444         now_str = now.strftime('%Y-%m-%d %H:%M')
2445         return name + ' ' + now_str
2446
2447     def _int(self, v, name, fatal=False, **kwargs):
2448         res = int_or_none(v, **kwargs)
2449         if 'get_attr' in kwargs:
2450             print(getattr(v, kwargs['get_attr']))
2451         if res is None:
2452             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2453             if fatal:
2454                 raise ExtractorError(msg)
2455             else:
2456                 self._downloader.report_warning(msg)
2457         return res
2458
2459     def _float(self, v, name, fatal=False, **kwargs):
2460         res = float_or_none(v, **kwargs)
2461         if res is None:
2462             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2463             if fatal:
2464                 raise ExtractorError(msg)
2465             else:
2466                 self._downloader.report_warning(msg)
2467         return res
2468
2469     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2470                     path='/', secure=False, discard=False, rest={}, **kwargs):
2471         cookie = compat_cookiejar.Cookie(
2472             0, name, value, port, port is not None, domain, True,
2473             domain.startswith('.'), path, True, secure, expire_time,
2474             discard, None, None, rest)
2475         self._downloader.cookiejar.set_cookie(cookie)
2476
2477     def _get_cookies(self, url):
2478         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2479         req = sanitized_Request(url)
2480         self._downloader.cookiejar.add_cookie_header(req)
2481         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2482
2483     def get_testcases(self, include_onlymatching=False):
2484         t = getattr(self, '_TEST', None)
2485         if t:
2486             assert not hasattr(self, '_TESTS'), \
2487                 '%s has _TEST and _TESTS' % type(self).__name__
2488             tests = [t]
2489         else:
2490             tests = getattr(self, '_TESTS', [])
2491         for t in tests:
2492             if not include_onlymatching and t.get('only_matching', False):
2493                 continue
2494             t['name'] = type(self).__name__[:-len('IE')]
2495             yield t
2496
2497     def is_suitable(self, age_limit):
2498         """ Test whether the extractor is generally suitable for the given
2499         age limit (i.e. pornographic sites are not, all others usually are) """
2500
2501         any_restricted = False
2502         for tc in self.get_testcases(include_onlymatching=False):
2503             if tc.get('playlist', []):
2504                 tc = tc['playlist'][0]
2505             is_restricted = age_restricted(
2506                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2507             if not is_restricted:
2508                 return True
2509             any_restricted = any_restricted or is_restricted
2510         return not any_restricted
2511
2512     def extract_subtitles(self, *args, **kwargs):
2513         if (self._downloader.params.get('writesubtitles', False) or
2514                 self._downloader.params.get('listsubtitles')):
2515             return self._get_subtitles(*args, **kwargs)
2516         return {}
2517
2518     def _get_subtitles(self, *args, **kwargs):
2519         raise NotImplementedError('This method must be implemented by subclasses')
2520
2521     @staticmethod
2522     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2523         """ Merge subtitle items for one language. Items with duplicated URLs
2524         will be dropped. """
2525         list1_urls = set([item['url'] for item in subtitle_list1])
2526         ret = list(subtitle_list1)
2527         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2528         return ret
2529
2530     @classmethod
2531     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2532         """ Merge two subtitle dictionaries, language by language. """
2533         ret = dict(subtitle_dict1)
2534         for lang in subtitle_dict2:
2535             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2536         return ret
2537
2538     def extract_automatic_captions(self, *args, **kwargs):
2539         if (self._downloader.params.get('writeautomaticsub', False) or
2540                 self._downloader.params.get('listsubtitles')):
2541             return self._get_automatic_captions(*args, **kwargs)
2542         return {}
2543
2544     def _get_automatic_captions(self, *args, **kwargs):
2545         raise NotImplementedError('This method must be implemented by subclasses')
2546
2547     def mark_watched(self, *args, **kwargs):
2548         if (self._downloader.params.get('mark_watched', False) and
2549                 (self._get_login_info()[0] is not None or
2550                     self._downloader.params.get('cookiefile') is not None)):
2551             self._mark_watched(*args, **kwargs)
2552
2553     def _mark_watched(self, *args, **kwargs):
2554         raise NotImplementedError('This method must be implemented by subclasses')
2555
2556     def geo_verification_headers(self):
2557         headers = {}
2558         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2559         if geo_verification_proxy:
2560             headers['Ytdl-request-proxy'] = geo_verification_proxy
2561         return headers
2562
2563     def _generic_id(self, url):
2564         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2565
2566     def _generic_title(self, url):
2567         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2568
2569
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors handling paged search queries.

    Accepted URLs have the form _SEARCH_KEY<prefix>:{query}, where <prefix>
    is empty (one result), a positive integer (that many results) or 'all'
    (up to _MAX_RESULTS results).
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression (as a string) matching search URLs."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query this extractor understands."""
        pattern = cls._make_valid_url()
        return bool(re.match(pattern, url))

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results.

        Raises ExtractorError when the query does not match the pattern or
        the requested count is not positive. Requests above _MAX_RESULTS are
        clamped, with a warning reported through the downloader.
        """
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = parsed.group('prefix', 'query')

        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            # The pattern from _make_valid_url only admits positive numbers;
            # this check matters if a subclass supplies a looser pattern.
            if wanted <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
            if wanted > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        reason = 'This method must be implemented by subclasses'
        raise NotImplementedError(reason)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY