[malltv] Add extractor (closes #18058)
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_integer_types,
23     compat_http_client,
24     compat_os_name,
25     compat_str,
26     compat_urllib_error,
27     compat_urllib_parse_unquote,
28     compat_urllib_parse_urlencode,
29     compat_urllib_request,
30     compat_urlparse,
31     compat_xml_parse_error,
32 )
33 from ..downloader.f4m import (
34     get_base_url,
35     remove_encrypted_media,
36 )
37 from ..utils import (
38     NO_DEFAULT,
39     age_restricted,
40     base_url,
41     bug_reports_message,
42     clean_html,
43     compiled_regex_type,
44     determine_ext,
45     determine_protocol,
46     error_to_compat_str,
47     ExtractorError,
48     extract_attributes,
49     fix_xml_ampersands,
50     float_or_none,
51     GeoRestrictedError,
52     GeoUtils,
53     int_or_none,
54     js_to_json,
55     JSON_LD_RE,
56     mimetype2ext,
57     orderedSet,
58     parse_codecs,
59     parse_duration,
60     parse_iso8601,
61     parse_m3u8_attributes,
62     RegexNotFoundError,
63     sanitized_Request,
64     sanitize_filename,
65     unescapeHTML,
66     unified_strdate,
67     unified_timestamp,
68     update_Request,
69     update_url_query,
70     urljoin,
71     url_basename,
72     url_or_none,
73     xpath_element,
74     xpath_text,
75     xpath_with_ns,
76 )
77
78
79 class InfoExtractor(object):
80     """Information Extractor class.
81
82     Information extractors are the classes that, given a URL, extract
83     information about the video (or videos) the URL refers to. This
84     information includes the real video URL, the video title, author and
85     others. The information is stored in a dictionary which is then
86     passed to the YoutubeDL. The YoutubeDL processes this
87     information possibly downloading the video to the file system, among
88     other possible outcomes.
89
90     The type field determines the type of the result.
91     By far the most common value (and the default if _type is missing) is
92     "video", which indicates a single video.
93
94     For a video, the dictionaries must include the following fields:
95
96     id:             Video identifier.
97     title:          Video title, unescaped.
98
99     Additionally, it must contain either a formats entry or a url one:
100
101     formats:        A list of dictionaries for each format available, ordered
102                     from worst to best quality.
103
104                     Potential fields:
105                     * url        Mandatory. The URL of the video file
106                     * manifest_url
107                                  The URL of the manifest file in case of
108                                  fragmented media (DASH, hls, hds)
109                     * ext        Will be calculated from URL if missing
110                     * format     A human-readable description of the format
111                                  ("mp4 container with h264/opus").
112                                  Calculated from the format_id, width, height,
113                                  and format_note fields if missing.
114                     * format_id  A short description of the format
115                                  ("mp4_h264_opus" or "19").
116                                 Technically optional, but strongly recommended.
117                     * format_note Additional info about the format
118                                  ("3D" or "DASH video")
119                     * width      Width of the video, if known
120                     * height     Height of the video, if known
121                     * resolution Textual description of width and height
122                     * tbr        Average bitrate of audio and video in KBit/s
123                     * abr        Average audio bitrate in KBit/s
124                     * acodec     Name of the audio codec in use
125                     * asr        Audio sampling rate in Hertz
126                     * vbr        Average video bitrate in KBit/s
127                     * fps        Frame rate
128                     * vcodec     Name of the video codec in use
129                     * container  Name of the container format
130                     * filesize   The number of bytes, if known in advance
131                     * filesize_approx  An estimate for the number of bytes
132                     * player_url SWF Player URL (used for rtmpdump).
133                     * protocol   The protocol that will be used for the actual
134                                  download, lower-case.
135                                  "http", "https", "rtsp", "rtmp", "rtmpe",
136                                  "m3u8", "m3u8_native" or "http_dash_segments".
137                     * fragment_base_url
138                                  Base URL for fragments. Each fragment's path
139                                  value (if present) will be relative to
140                                  this URL.
141                     * fragments  A list of fragments of a fragmented media.
142                                  Each fragment entry must contain either an url
143                                  or a path. If an url is present it should be
144                                  considered by a client. Otherwise both path and
145                                  fragment_base_url must be present. Here is
146                                  the list of all potential fields:
147                                  * "url" - fragment's URL
148                                  * "path" - fragment's path relative to
149                                             fragment_base_url
150                                  * "duration" (optional, int or float)
151                                  * "filesize" (optional, int)
152                     * preference Order number of this format. If this field is
153                                  present and not None, the formats get sorted
154                                  by this field, regardless of all other values.
155                                  -1 for default (order by other properties),
156                                  -2 or smaller for less than default.
157                                  < -1000 to hide the format (if there is
158                                     another one which is strictly better)
159                     * language   Language code, e.g. "de" or "en-US".
160                     * language_preference  Is this in the language mentioned in
161                                  the URL?
162                                  10 if it's what the URL is about,
163                                  -1 for default (don't know),
164                                  -10 otherwise, other values reserved for now.
165                     * quality    Order number of the video quality of this
166                                  format, irrespective of the file format.
167                                  -1 for default (order by other properties),
168                                  -2 or smaller for less than default.
169                     * source_preference  Order number for this video source
170                                   (quality takes higher priority)
171                                  -1 for default (order by other properties),
172                                  -2 or smaller for less than default.
173                     * http_headers  A dictionary of additional HTTP headers
174                                  to add to the request.
175                     * stretched_ratio  If given and not 1, indicates that the
176                                  video's pixels are not square.
177                                  width : height ratio as float.
178                     * no_resume  The server does not support resuming the
179                                  (HTTP or RTMP) download. Boolean.
180                     * downloader_options  A dictionary of downloader options as
181                                  described in FileDownloader
182
183     url:            Final video URL.
184     ext:            Video filename extension.
185     format:         The video format, defaults to ext (used for --get-format)
186     player_url:     SWF Player URL (used for rtmpdump).
187
188     The following fields are optional:
189
190     alt_title:      A secondary title of the video.
191     display_id      An alternative identifier for the video, not necessarily
192                     unique, but available before title. Typically, id is
193                     something like "4234987", title "Dancing naked mole rats",
194                     and display_id "dancing-naked-mole-rats"
195     thumbnails:     A list of dictionaries, with the following entries:
196                         * "id" (optional, string) - Thumbnail format ID
197                         * "url"
198                         * "preference" (optional, int) - quality of the image
199                         * "width" (optional, int)
200                         * "height" (optional, int)
201                         * "resolution" (optional, string "{width}x{height}",
202                                         deprecated)
203                         * "filesize" (optional, int)
204     thumbnail:      Full URL to a video thumbnail image.
205     description:    Full video description.
206     uploader:       Full name of the video uploader.
207     license:        License name the video is licensed under.
208     creator:        The creator of the video.
209     release_date:   The date (YYYYMMDD) when the video was released.
210     timestamp:      UNIX timestamp of the moment the video became available.
211     upload_date:    Video upload date (YYYYMMDD).
212                     If not explicitly set, calculated from timestamp.
213     uploader_id:    Nickname or id of the video uploader.
214     uploader_url:   Full URL to a personal webpage of the video uploader.
215     channel:        Full name of the channel the video is uploaded on.
216                     Note that channel fields may or may not repeat uploader
217                     fields. This depends on a particular extractor.
218     channel_id:     Id of the channel.
219     channel_url:    Full URL to a channel webpage.
220     location:       Physical location where the video was filmed.
221     subtitles:      The available subtitles as a dictionary in the format
222                     {tag: subformats}. "tag" is usually a language code, and
223                     "subformats" is a list sorted from lower to higher
224                     preference, each element is a dictionary with the "ext"
225                     entry and one of:
226                         * "data": The subtitles file contents
227                         * "url": A URL pointing to the subtitles file
228                     "ext" will be calculated from URL if missing
229     automatic_captions: Like 'subtitles', used by the YoutubeIE for
230                     automatically generated captions
231     duration:       Length of the video in seconds, as an integer or float.
232     view_count:     How many users have watched the video on the platform.
233     like_count:     Number of positive ratings of the video
234     dislike_count:  Number of negative ratings of the video
235     repost_count:   Number of reposts of the video
236     average_rating: Average rating given by users, the scale used depends on the webpage
237     comment_count:  Number of comments on the video
238     comments:       A list of comments, each with one or more of the following
239                     properties (all but one of text or html optional):
240                         * "author" - human-readable name of the comment author
241                         * "author_id" - user ID of the comment author
242                         * "id" - Comment ID
243                         * "html" - Comment as HTML
244                         * "text" - Plain text of the comment
245                         * "timestamp" - UNIX timestamp of comment
246                         * "parent" - ID of the comment this one is replying to.
247                                      Set to "root" to indicate that this is a
248                                      comment to the original video.
249     age_limit:      Age restriction for the video, as an integer (years)
250     webpage_url:    The URL to the video webpage, if given to youtube-dl it
251                     should allow to get the same result again. (It will be set
252                     by YoutubeDL if it's missing)
253     categories:     A list of categories that the video falls in, for example
254                     ["Sports", "Berlin"]
255     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
256     is_live:        True, False, or None (=unknown). Whether this video is a
257                     live stream that goes on instead of a fixed-length video.
258     start_time:     Time in seconds where the reproduction should start, as
259                     specified in the URL.
260     end_time:       Time in seconds where the reproduction should end, as
261                     specified in the URL.
262     chapters:       A list of dictionaries, with the following entries:
263                         * "start_time" - The start time of the chapter in seconds
264                         * "end_time" - The end time of the chapter in seconds
265                         * "title" (optional, string)
266
267     The following fields should only be used when the video belongs to some logical
268     chapter or section:
269
270     chapter:        Name or title of the chapter the video belongs to.
271     chapter_number: Number of the chapter the video belongs to, as an integer.
272     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
273
274     The following fields should only be used when the video is an episode of some
275     series, programme or podcast:
276
277     series:         Title of the series or programme the video episode belongs to.
278     season:         Title of the season the video episode belongs to.
279     season_number:  Number of the season the video episode belongs to, as an integer.
280     season_id:      Id of the season the video episode belongs to, as a unicode string.
281     episode:        Title of the video episode. Unlike mandatory video title field,
282                     this field should denote the exact title of the video episode
283                     without any kind of decoration.
284     episode_number: Number of the video episode within a season, as an integer.
285     episode_id:     Id of the video episode, as a unicode string.
286
287     The following fields should only be used when the media is a track or a part of
288     a music album:
289
290     track:          Title of the track.
291     track_number:   Number of the track within an album or a disc, as an integer.
292     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
293                     as a unicode string.
294     artist:         Artist(s) of the track.
295     genre:          Genre(s) of the track.
296     album:          Title of the album the track belongs to.
297     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
298     album_artist:   List of all artists appeared on the album (e.g.
299                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
300                     and compilations).
301     disc_number:    Number of the disc or other physical medium the track belongs to,
302                     as an integer.
303     release_year:   Year (YYYY) when the album was released.
304
305     Unless mentioned otherwise, the fields should be Unicode strings.
306
307     Unless mentioned otherwise, None is equivalent to absence of information.
308
309
310     _type "playlist" indicates multiple videos.
311     There must be a key "entries", which is a list, an iterable, or a PagedList
312     object, each element of which is a valid dictionary by this specification.
313
314     Additionally, playlists can have "id", "title", "description", "uploader",
315     "uploader_id", "uploader_url" attributes with the same semantics as videos
316     (see above).
317
318
319     _type "multi_video" indicates that there are multiple videos that
320     form a single show, for example multiple acts of an opera or TV episode.
321     It must have an entries key like a playlist and contain all the keys
322     required for a video at the same time.
323
324
325     _type "url" indicates that the video must be extracted from another
326     location, possibly by a different extractor. Its only required key is:
327     "url" - the next URL to extract.
328     The key "ie_key" can be set to the class name (minus the trailing "IE",
329     e.g. "Youtube") if the extractor class is known in advance.
330     Additionally, the dictionary may have any properties of the resolved entity
331     known in advance, for example "title" if the title of the referred video is
332     known ahead of time.
333
334
335     _type "url_transparent" entities have the same specification as "url", but
336     indicate that the given additional information is more precise than the one
337     associated with the resolved URL.
338     This is useful when a site employs a video service that hosts the video and
339     its technical metadata, but that video service does not embed a useful
340     title, description etc.
341
342
343     Subclasses of this one should re-define the _real_initialize() and
344     _real_extract() methods and define a _VALID_URL regexp.
345     Probably, they should also be added to the list of extractors.
346
347     _GEO_BYPASS attribute may be set to False in order to disable
348     geo restriction bypass mechanisms for a particular extractor.
349     Though it won't disable explicit geo restriction bypass based on
350     country code provided with geo_bypass_country.
351
352     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
353     countries for this extractor. One of these countries will be used by
354     geo restriction bypass mechanism right away in order to bypass
355     geo restriction, of course, if the mechanism is not disabled.
356
357     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
358     IP blocks in CIDR notation for this extractor. One of these IP blocks
359     will be used by geo restriction bypass mechanism similarly
360     to _GEO_COUNTRIES.
361
362     Finally, the _WORKING attribute should be set to False for broken IEs
363     in order to warn the users and skip the tests.
364     """
365
366     _ready = False
367     _downloader = None
368     _x_forwarded_for_ip = None
369     _GEO_BYPASS = True
370     _GEO_COUNTRIES = None
371     _GEO_IP_BLOCKS = None
372     _WORKING = True
373
374     def __init__(self, downloader=None):
375         """Constructor. Receives an optional downloader."""
376         self._ready = False
377         self._x_forwarded_for_ip = None
378         self.set_downloader(downloader)
379
380     @classmethod
381     def suitable(cls, url):
382         """Receives a URL and returns True if suitable for this IE."""
383
384         # This does not use has/getattr intentionally - we want to know whether
385         # we have cached the regexp for *this* class, whereas getattr would also
386         # match the superclass
387         if '_VALID_URL_RE' not in cls.__dict__:
388             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
389         return cls._VALID_URL_RE.match(url) is not None
390
391     @classmethod
392     def _match_id(cls, url):
393         if '_VALID_URL_RE' not in cls.__dict__:
394             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
395         m = cls._VALID_URL_RE.match(url)
396         assert m
397         return compat_str(m.group('id'))
398
399     @classmethod
400     def working(cls):
401         """Getter method for _WORKING."""
402         return cls._WORKING
403
404     def initialize(self):
405         """Initializes an instance (authentication, etc)."""
406         self._initialize_geo_bypass({
407             'countries': self._GEO_COUNTRIES,
408             'ip_blocks': self._GEO_IP_BLOCKS,
409         })
410         if not self._ready:
411             self._real_initialize()
412             self._ready = True
413
414     def _initialize_geo_bypass(self, geo_bypass_context):
415         """
416         Initialize geo restriction bypass mechanism.
417
418         This method is used to initialize geo bypass mechanism based on faking
419         X-Forwarded-For HTTP header. A random country from provided country list
420         is selected and a random IP belonging to this country is generated. This
421         IP will be passed as X-Forwarded-For HTTP header in all subsequent
422         HTTP requests.
423
424         This method will be used for initial geo bypass mechanism initialization
425         during the instance initialization with _GEO_COUNTRIES and
426         _GEO_IP_BLOCKS.
427
428         You may also manually call it from extractor's code if geo bypass
429         information is not available beforehand (e.g. obtained during
430         extraction) or due to some other reason. In this case you should pass
431         this information in geo bypass context passed as first argument. It may
432         contain following fields:
433
434         countries:  List of geo unrestricted countries (similar
435                     to _GEO_COUNTRIES)
436         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
437                     (similar to _GEO_IP_BLOCKS)
438
439         """
440         if not self._x_forwarded_for_ip:
441
442             # Geo bypass mechanism is explicitly disabled by user
443             if not self._downloader.params.get('geo_bypass', True):
444                 return
445
446             if not geo_bypass_context:
447                 geo_bypass_context = {}
448
449             # Backward compatibility: previously _initialize_geo_bypass
450             # expected a list of countries, some 3rd party code may still use
451             # it this way
452             if isinstance(geo_bypass_context, (list, tuple)):
453                 geo_bypass_context = {
454                     'countries': geo_bypass_context,
455                 }
456
457             # The whole point of geo bypass mechanism is to fake IP
458             # as X-Forwarded-For HTTP header based on some IP block or
459             # country code.
460
461             # Path 1: bypassing based on IP block in CIDR notation
462
463             # Explicit IP block specified by user, use it right away
464             # regardless of whether extractor is geo bypassable or not
465             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
466
467             # Otherwise use random IP block from geo bypass context but only
468             # if extractor is known as geo bypassable
469             if not ip_block:
470                 ip_blocks = geo_bypass_context.get('ip_blocks')
471                 if self._GEO_BYPASS and ip_blocks:
472                     ip_block = random.choice(ip_blocks)
473
474             if ip_block:
475                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
476                 if self._downloader.params.get('verbose', False):
477                     self._downloader.to_screen(
478                         '[debug] Using fake IP %s as X-Forwarded-For.'
479                         % self._x_forwarded_for_ip)
480                 return
481
482             # Path 2: bypassing based on country code
483
484             # Explicit country code specified by user, use it right away
485             # regardless of whether extractor is geo bypassable or not
486             country = self._downloader.params.get('geo_bypass_country', None)
487
488             # Otherwise use random country code from geo bypass context but
489             # only if extractor is known as geo bypassable
490             if not country:
491                 countries = geo_bypass_context.get('countries')
492                 if self._GEO_BYPASS and countries:
493                     country = random.choice(countries)
494
495             if country:
496                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
497                 if self._downloader.params.get('verbose', False):
498                     self._downloader.to_screen(
499                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
500                         % (self._x_forwarded_for_ip, country.upper()))
501
502     def extract(self, url):
503         """Extracts URL information and returns it in list of dicts."""
504         try:
505             for _ in range(2):
506                 try:
507                     self.initialize()
508                     ie_result = self._real_extract(url)
509                     if self._x_forwarded_for_ip:
510                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
511                     return ie_result
512                 except GeoRestrictedError as e:
513                     if self.__maybe_fake_ip_and_retry(e.countries):
514                         continue
515                     raise
516         except ExtractorError:
517             raise
518         except compat_http_client.IncompleteRead as e:
519             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
520         except (KeyError, StopIteration) as e:
521             raise ExtractorError('An extractor error has occurred.', cause=e)
522
523     def __maybe_fake_ip_and_retry(self, countries):
524         if (not self._downloader.params.get('geo_bypass_country', None) and
525                 self._GEO_BYPASS and
526                 self._downloader.params.get('geo_bypass', True) and
527                 not self._x_forwarded_for_ip and
528                 countries):
529             country_code = random.choice(countries)
530             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
531             if self._x_forwarded_for_ip:
532                 self.report_warning(
533                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
534                     % (self._x_forwarded_for_ip, country_code.upper()))
535                 return True
536         return False
537
538     def set_downloader(self, downloader):
539         """Sets the downloader for this IE."""
540         self._downloader = downloader
541
542     def _real_initialize(self):
543         """Real initialization process. Redefine in subclasses."""
544         pass
545
546     def _real_extract(self, url):
547         """Real extraction process. Redefine in subclasses."""
548         pass
549
550     @classmethod
551     def ie_key(cls):
552         """A string for getting the InfoExtractor with get_info_extractor"""
553         return compat_str(cls.__name__[:-2])
554
555     @property
556     def IE_NAME(self):
557         return compat_str(type(self).__name__[:-2])
558
559     @staticmethod
560     def __can_accept_status_code(err, expected_status):
561         assert isinstance(err, compat_urllib_error.HTTPError)
562         if expected_status is None:
563             return False
564         if isinstance(expected_status, compat_integer_types):
565             return err.code == expected_status
566         elif isinstance(expected_status, (list, tuple)):
567             return err.code in expected_status
568         elif callable(expected_status):
569             return expected_status(err.code) is True
570         else:
571             assert False
572
573     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
574         """
575         Return the response handle.
576
577         See _download_webpage docstring for arguments specification.
578         """
579         if note is None:
580             self.report_download_webpage(video_id)
581         elif note is not False:
582             if video_id is None:
583                 self.to_screen('%s' % (note,))
584             else:
585                 self.to_screen('%s: %s' % (video_id, note))
586
587         # Some sites check X-Forwarded-For HTTP header in order to figure out
588         # the origin of the client behind proxy. This allows bypassing geo
589         # restriction by faking this header's value to IP that belongs to some
590         # geo unrestricted country. We will do so once we encounter any
591         # geo restriction error.
592         if self._x_forwarded_for_ip:
593             if 'X-Forwarded-For' not in headers:
594                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
595
596         if isinstance(url_or_request, compat_urllib_request.Request):
597             url_or_request = update_Request(
598                 url_or_request, data=data, headers=headers, query=query)
599         else:
600             if query:
601                 url_or_request = update_url_query(url_or_request, query)
602             if data is not None or headers:
603                 url_or_request = sanitized_Request(url_or_request, data, headers)
604         try:
605             return self._downloader.urlopen(url_or_request)
606         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
607             if isinstance(err, compat_urllib_error.HTTPError):
608                 if self.__can_accept_status_code(err, expected_status):
609                     # Retain reference to error to prevent file object from
610                     # being closed before it can be read. Works around the
611                     # effects of <https://bugs.python.org/issue15002>
612                     # introduced in Python 3.4.1.
613                     err.fp._error = err
614                     return err.fp
615
616             if errnote is False:
617                 return False
618             if errnote is None:
619                 errnote = 'Unable to download webpage'
620
621             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
622             if fatal:
623                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
624             else:
625                 self._downloader.report_warning(errmsg)
626                 return False
627
628     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
629         """
630         Return a tuple (page content as string, URL handle).
631
632         See _download_webpage docstring for arguments specification.
633         """
634         # Strip hashes from the URL (#1038)
635         if isinstance(url_or_request, (compat_str, str)):
636             url_or_request = url_or_request.partition('#')[0]
637
638         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
639         if urlh is False:
640             assert not fatal
641             return False
642         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
643         return (content, urlh)
644
645     @staticmethod
646     def _guess_encoding_from_content(content_type, webpage_bytes):
647         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
648         if m:
649             encoding = m.group(1)
650         else:
651             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
652                           webpage_bytes[:1024])
653             if m:
654                 encoding = m.group(1).decode('ascii')
655             elif webpage_bytes.startswith(b'\xff\xfe'):
656                 encoding = 'utf-16'
657             else:
658                 encoding = 'utf-8'
659
660         return encoding
661
662     def __check_blocked(self, content):
663         first_block = content[:512]
664         if ('<title>Access to this site is blocked</title>' in content and
665                 'Websense' in first_block):
666             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
667             blocked_iframe = self._html_search_regex(
668                 r'<iframe src="([^"]+)"', content,
669                 'Websense information URL', default=None)
670             if blocked_iframe:
671                 msg += ' Visit %s for more details' % blocked_iframe
672             raise ExtractorError(msg, expected=True)
673         if '<title>The URL you requested has been blocked</title>' in first_block:
674             msg = (
675                 'Access to this webpage has been blocked by Indian censorship. '
676                 'Use a VPN or proxy server (with --proxy) to route around it.')
677             block_msg = self._html_search_regex(
678                 r'</h1><p>(.*?)</p>',
679                 content, 'block message', default=None)
680             if block_msg:
681                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
682             raise ExtractorError(msg, expected=True)
683         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
684                 'blocklist.rkn.gov.ru' in content):
685             raise ExtractorError(
686                 'Access to this webpage has been blocked by decision of the Russian government. '
687                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
688                 expected=True)
689
690     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
691         content_type = urlh.headers.get('Content-Type', '')
692         webpage_bytes = urlh.read()
693         if prefix is not None:
694             webpage_bytes = prefix + webpage_bytes
695         if not encoding:
696             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
697         if self._downloader.params.get('dump_intermediate_pages', False):
698             self.to_screen('Dumping request to ' + urlh.geturl())
699             dump = base64.b64encode(webpage_bytes).decode('ascii')
700             self._downloader.to_screen(dump)
701         if self._downloader.params.get('write_pages', False):
702             basen = '%s_%s' % (video_id, urlh.geturl())
703             if len(basen) > 240:
704                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
705                 basen = basen[:240 - len(h)] + h
706             raw_filename = basen + '.dump'
707             filename = sanitize_filename(raw_filename, restricted=True)
708             self.to_screen('Saving request to ' + filename)
709             # Working around MAX_PATH limitation on Windows (see
710             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
711             if compat_os_name == 'nt':
712                 absfilepath = os.path.abspath(filename)
713                 if len(absfilepath) > 259:
714                     filename = '\\\\?\\' + absfilepath
715             with open(filename, 'wb') as outf:
716                 outf.write(webpage_bytes)
717
718         try:
719             content = webpage_bytes.decode(encoding, 'replace')
720         except LookupError:
721             content = webpage_bytes.decode('utf-8', 'replace')
722
723         self.__check_blocked(content)
724
725         return content
726
727     def _download_webpage(
728             self, url_or_request, video_id, note=None, errnote=None,
729             fatal=True, tries=1, timeout=5, encoding=None, data=None,
730             headers={}, query={}, expected_status=None):
731         """
732         Return the data of the page as a string.
733
734         Arguments:
735         url_or_request -- plain text URL as a string or
736             a compat_urllib_request.Requestobject
737         video_id -- Video/playlist/item identifier (string)
738
739         Keyword arguments:
740         note -- note printed before downloading (string)
741         errnote -- note printed in case of an error (string)
742         fatal -- flag denoting whether error should be considered fatal,
743             i.e. whether it should cause ExtractionError to be raised,
744             otherwise a warning will be reported and extraction continued
745         tries -- number of tries
746         timeout -- sleep interval between tries
747         encoding -- encoding for a page content decoding, guessed automatically
748             when not explicitly specified
749         data -- POST data (bytes)
750         headers -- HTTP headers (dict)
751         query -- URL query (dict)
752         expected_status -- allows to accept failed HTTP requests (non 2xx
753             status code) by explicitly specifying a set of accepted status
754             codes. Can be any of the following entities:
755                 - an integer type specifying an exact failed status code to
756                   accept
757                 - a list or a tuple of integer types specifying a list of
758                   failed status codes to accept
759                 - a callable accepting an actual failed status code and
760                   returning True if it should be accepted
761             Note that this argument does not affect success status codes (2xx)
762             which are always accepted.
763         """
764
765         success = False
766         try_count = 0
767         while success is False:
768             try:
769                 res = self._download_webpage_handle(
770                     url_or_request, video_id, note, errnote, fatal,
771                     encoding=encoding, data=data, headers=headers, query=query,
772                     expected_status=expected_status)
773                 success = True
774             except compat_http_client.IncompleteRead as e:
775                 try_count += 1
776                 if try_count >= tries:
777                     raise e
778                 self._sleep(timeout, video_id)
779         if res is False:
780             return res
781         else:
782             content, _ = res
783             return content
784
785     def _download_xml_handle(
786             self, url_or_request, video_id, note='Downloading XML',
787             errnote='Unable to download XML', transform_source=None,
788             fatal=True, encoding=None, data=None, headers={}, query={},
789             expected_status=None):
790         """
791         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
792
793         See _download_webpage docstring for arguments specification.
794         """
795         res = self._download_webpage_handle(
796             url_or_request, video_id, note, errnote, fatal=fatal,
797             encoding=encoding, data=data, headers=headers, query=query,
798             expected_status=expected_status)
799         if res is False:
800             return res
801         xml_string, urlh = res
802         return self._parse_xml(
803             xml_string, video_id, transform_source=transform_source,
804             fatal=fatal), urlh
805
806     def _download_xml(
807             self, url_or_request, video_id,
808             note='Downloading XML', errnote='Unable to download XML',
809             transform_source=None, fatal=True, encoding=None,
810             data=None, headers={}, query={}, expected_status=None):
811         """
812         Return the xml as an xml.etree.ElementTree.Element.
813
814         See _download_webpage docstring for arguments specification.
815         """
816         res = self._download_xml_handle(
817             url_or_request, video_id, note=note, errnote=errnote,
818             transform_source=transform_source, fatal=fatal, encoding=encoding,
819             data=data, headers=headers, query=query,
820             expected_status=expected_status)
821         return res if res is False else res[0]
822
823     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
824         if transform_source:
825             xml_string = transform_source(xml_string)
826         try:
827             return compat_etree_fromstring(xml_string.encode('utf-8'))
828         except compat_xml_parse_error as ve:
829             errmsg = '%s: Failed to parse XML ' % video_id
830             if fatal:
831                 raise ExtractorError(errmsg, cause=ve)
832             else:
833                 self.report_warning(errmsg + str(ve))
834
835     def _download_json_handle(
836             self, url_or_request, video_id, note='Downloading JSON metadata',
837             errnote='Unable to download JSON metadata', transform_source=None,
838             fatal=True, encoding=None, data=None, headers={}, query={},
839             expected_status=None):
840         """
841         Return a tuple (JSON object, URL handle).
842
843         See _download_webpage docstring for arguments specification.
844         """
845         res = self._download_webpage_handle(
846             url_or_request, video_id, note, errnote, fatal=fatal,
847             encoding=encoding, data=data, headers=headers, query=query,
848             expected_status=expected_status)
849         if res is False:
850             return res
851         json_string, urlh = res
852         return self._parse_json(
853             json_string, video_id, transform_source=transform_source,
854             fatal=fatal), urlh
855
856     def _download_json(
857             self, url_or_request, video_id, note='Downloading JSON metadata',
858             errnote='Unable to download JSON metadata', transform_source=None,
859             fatal=True, encoding=None, data=None, headers={}, query={},
860             expected_status=None):
861         """
862         Return the JSON object as a dict.
863
864         See _download_webpage docstring for arguments specification.
865         """
866         res = self._download_json_handle(
867             url_or_request, video_id, note=note, errnote=errnote,
868             transform_source=transform_source, fatal=fatal, encoding=encoding,
869             data=data, headers=headers, query=query,
870             expected_status=expected_status)
871         return res if res is False else res[0]
872
873     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
874         if transform_source:
875             json_string = transform_source(json_string)
876         try:
877             return json.loads(json_string)
878         except ValueError as ve:
879             errmsg = '%s: Failed to parse JSON ' % video_id
880             if fatal:
881                 raise ExtractorError(errmsg, cause=ve)
882             else:
883                 self.report_warning(errmsg + str(ve))
884
885     def report_warning(self, msg, video_id=None):
886         idstr = '' if video_id is None else '%s: ' % video_id
887         self._downloader.report_warning(
888             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
889
890     def to_screen(self, msg):
891         """Print msg to screen, prefixing it with '[ie_name]'"""
892         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
893
894     def report_extraction(self, id_or_name):
895         """Report information extraction."""
896         self.to_screen('%s: Extracting information' % id_or_name)
897
898     def report_download_webpage(self, video_id):
899         """Report webpage download."""
900         self.to_screen('%s: Downloading webpage' % video_id)
901
902     def report_age_confirmation(self):
903         """Report attempt to confirm age."""
904         self.to_screen('Confirming age')
905
906     def report_login(self):
907         """Report attempt to log in."""
908         self.to_screen('Logging in')
909
910     @staticmethod
911     def raise_login_required(msg='This video is only available for registered users'):
912         raise ExtractorError(
913             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
914             expected=True)
915
916     @staticmethod
917     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
918         raise GeoRestrictedError(msg, countries=countries)
919
920     # Methods for following #608
921     @staticmethod
922     def url_result(url, ie=None, video_id=None, video_title=None):
923         """Returns a URL that points to a page that should be processed"""
924         # TODO: ie should be the class used for getting the info
925         video_info = {'_type': 'url',
926                       'url': url,
927                       'ie_key': ie}
928         if video_id is not None:
929             video_info['id'] = video_id
930         if video_title is not None:
931             video_info['title'] = video_title
932         return video_info
933
934     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
935         urls = orderedSet(
936             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
937             for m in matches)
938         return self.playlist_result(
939             urls, playlist_id=playlist_id, playlist_title=playlist_title)
940
941     @staticmethod
942     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
943         """Returns a playlist"""
944         video_info = {'_type': 'playlist',
945                       'entries': entries}
946         if playlist_id:
947             video_info['id'] = playlist_id
948         if playlist_title:
949             video_info['title'] = playlist_title
950         if playlist_description:
951             video_info['description'] = playlist_description
952         return video_info
953
954     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
955         """
956         Perform a regex search on the given string, using a single or a list of
957         patterns returning the first matching group.
958         In case of failure return a default value or raise a WARNING or a
959         RegexNotFoundError, depending on fatal, specifying the field name.
960         """
961         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
962             mobj = re.search(pattern, string, flags)
963         else:
964             for p in pattern:
965                 mobj = re.search(p, string, flags)
966                 if mobj:
967                     break
968
969         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
970             _name = '\033[0;34m%s\033[0m' % name
971         else:
972             _name = name
973
974         if mobj:
975             if group is None:
976                 # return the first matching group
977                 return next(g for g in mobj.groups() if g is not None)
978             else:
979                 return mobj.group(group)
980         elif default is not NO_DEFAULT:
981             return default
982         elif fatal:
983             raise RegexNotFoundError('Unable to extract %s' % _name)
984         else:
985             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
986             return None
987
988     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
989         """
990         Like _search_regex, but strips HTML tags and unescapes entities.
991         """
992         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
993         if res:
994             return clean_html(res).strip()
995         else:
996             return res
997
998     def _get_netrc_login_info(self, netrc_machine=None):
999         username = None
1000         password = None
1001         netrc_machine = netrc_machine or self._NETRC_MACHINE
1002
1003         if self._downloader.params.get('usenetrc', False):
1004             try:
1005                 info = netrc.netrc().authenticators(netrc_machine)
1006                 if info is not None:
1007                     username = info[0]
1008                     password = info[2]
1009                 else:
1010                     raise netrc.NetrcParseError(
1011                         'No authenticators for %s' % netrc_machine)
1012             except (IOError, netrc.NetrcParseError) as err:
1013                 self._downloader.report_warning(
1014                     'parsing .netrc: %s' % error_to_compat_str(err))
1015
1016         return username, password
1017
1018     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1019         """
1020         Get the login info as (username, password)
1021         First look for the manually specified credentials using username_option
1022         and password_option as keys in params dictionary. If no such credentials
1023         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1024         value.
1025         If there's no info available, return (None, None)
1026         """
1027         if self._downloader is None:
1028             return (None, None)
1029
1030         downloader_params = self._downloader.params
1031
1032         # Attempt to use provided username and password or .netrc data
1033         if downloader_params.get(username_option) is not None:
1034             username = downloader_params[username_option]
1035             password = downloader_params[password_option]
1036         else:
1037             username, password = self._get_netrc_login_info(netrc_machine)
1038
1039         return username, password
1040
1041     def _get_tfa_info(self, note='two-factor verification code'):
1042         """
1043         Get the two-factor authentication info
1044         TODO - asking the user will be required for sms/phone verify
1045         currently just uses the command line option
1046         If there's no info available, return None
1047         """
1048         if self._downloader is None:
1049             return None
1050         downloader_params = self._downloader.params
1051
1052         if downloader_params.get('twofactor') is not None:
1053             return downloader_params['twofactor']
1054
1055         return compat_getpass('Type %s and press [Return]: ' % note)
1056
1057     # Helper functions for extracting OpenGraph info
1058     @staticmethod
1059     def _og_regexes(prop):
1060         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1061         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1062                        % {'prop': re.escape(prop)})
1063         template = r'<meta[^>]+?%s[^>]+?%s'
1064         return [
1065             template % (property_re, content_re),
1066             template % (content_re, property_re),
1067         ]
1068
1069     @staticmethod
1070     def _meta_regex(prop):
1071         return r'''(?isx)<meta
1072                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1073                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1074
1075     def _og_search_property(self, prop, html, name=None, **kargs):
1076         if not isinstance(prop, (list, tuple)):
1077             prop = [prop]
1078         if name is None:
1079             name = 'OpenGraph %s' % prop[0]
1080         og_regexes = []
1081         for p in prop:
1082             og_regexes.extend(self._og_regexes(p))
1083         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1084         if escaped is None:
1085             return None
1086         return unescapeHTML(escaped)
1087
1088     def _og_search_thumbnail(self, html, **kargs):
1089         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1090
1091     def _og_search_description(self, html, **kargs):
1092         return self._og_search_property('description', html, fatal=False, **kargs)
1093
1094     def _og_search_title(self, html, **kargs):
1095         return self._og_search_property('title', html, **kargs)
1096
1097     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1098         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1099         if secure:
1100             regexes = self._og_regexes('video:secure_url') + regexes
1101         return self._html_search_regex(regexes, html, name, **kargs)
1102
1103     def _og_search_url(self, html, **kargs):
1104         return self._og_search_property('url', html, **kargs)
1105
1106     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1107         if not isinstance(name, (list, tuple)):
1108             name = [name]
1109         if display_name is None:
1110             display_name = name[0]
1111         return self._html_search_regex(
1112             [self._meta_regex(n) for n in name],
1113             html, display_name, fatal=fatal, group='content', **kwargs)
1114
1115     def _dc_search_uploader(self, html):
1116         return self._html_search_meta('dc.creator', html, 'uploader')
1117
1118     def _rta_search(self, html):
1119         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1120         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1121                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1122                      html):
1123             return 18
1124         return 0
1125
1126     def _media_rating_search(self, html):
1127         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1128         rating = self._html_search_meta('rating', html)
1129
1130         if not rating:
1131             return None
1132
1133         RATING_TABLE = {
1134             'safe for kids': 0,
1135             'general': 8,
1136             '14 years': 14,
1137             'mature': 17,
1138             'restricted': 19,
1139         }
1140         return RATING_TABLE.get(rating.lower())
1141
1142     def _family_friendly_search(self, html):
1143         # See http://schema.org/VideoObject
1144         family_friendly = self._html_search_meta(
1145             'isFamilyFriendly', html, default=None)
1146
1147         if not family_friendly:
1148             return None
1149
1150         RATING_TABLE = {
1151             '1': 0,
1152             'true': 0,
1153             '0': 18,
1154             'false': 18,
1155         }
1156         return RATING_TABLE.get(family_friendly.lower())
1157
1158     def _twitter_search_player(self, html):
1159         return self._html_search_meta('twitter:player', html,
1160                                       'twitter card player')
1161
1162     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1163         json_ld = self._search_regex(
1164             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1165         default = kwargs.get('default', NO_DEFAULT)
1166         if not json_ld:
1167             return default if default is not NO_DEFAULT else {}
1168         # JSON-LD may be malformed and thus `fatal` should be respected.
1169         # At the same time `default` may be passed that assumes `fatal=False`
1170         # for _search_regex. Let's simulate the same behavior here as well.
1171         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1172         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1173
1174     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1175         if isinstance(json_ld, compat_str):
1176             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1177         if not json_ld:
1178             return {}
1179         info = {}
1180         if not isinstance(json_ld, (list, tuple, dict)):
1181             return info
1182         if isinstance(json_ld, dict):
1183             json_ld = [json_ld]
1184
1185         INTERACTION_TYPE_MAP = {
1186             'CommentAction': 'comment',
1187             'AgreeAction': 'like',
1188             'DisagreeAction': 'dislike',
1189             'LikeAction': 'like',
1190             'DislikeAction': 'dislike',
1191             'ListenAction': 'view',
1192             'WatchAction': 'view',
1193             'ViewAction': 'view',
1194         }
1195
1196         def extract_interaction_statistic(e):
1197             interaction_statistic = e.get('interactionStatistic')
1198             if not isinstance(interaction_statistic, list):
1199                 return
1200             for is_e in interaction_statistic:
1201                 if not isinstance(is_e, dict):
1202                     continue
1203                 if is_e.get('@type') != 'InteractionCounter':
1204                     continue
1205                 interaction_type = is_e.get('interactionType')
1206                 if not isinstance(interaction_type, compat_str):
1207                     continue
1208                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1209                 if interaction_count is None:
1210                     continue
1211                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1212                 if not count_kind:
1213                     continue
1214                 count_key = '%s_count' % count_kind
1215                 if info.get(count_key) is not None:
1216                     continue
1217                 info[count_key] = interaction_count
1218
1219         def extract_video_object(e):
1220             assert e['@type'] == 'VideoObject'
1221             info.update({
1222                 'url': url_or_none(e.get('contentUrl')),
1223                 'title': unescapeHTML(e.get('name')),
1224                 'description': unescapeHTML(e.get('description')),
1225                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1226                 'duration': parse_duration(e.get('duration')),
1227                 'timestamp': unified_timestamp(e.get('uploadDate')),
1228                 'filesize': float_or_none(e.get('contentSize')),
1229                 'tbr': int_or_none(e.get('bitrate')),
1230                 'width': int_or_none(e.get('width')),
1231                 'height': int_or_none(e.get('height')),
1232                 'view_count': int_or_none(e.get('interactionCount')),
1233             })
1234             extract_interaction_statistic(e)
1235
1236         for e in json_ld:
1237             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1238                 item_type = e.get('@type')
1239                 if expected_type is not None and expected_type != item_type:
1240                     return info
1241                 if item_type in ('TVEpisode', 'Episode'):
1242                     episode_name = unescapeHTML(e.get('name'))
1243                     info.update({
1244                         'episode': episode_name,
1245                         'episode_number': int_or_none(e.get('episodeNumber')),
1246                         'description': unescapeHTML(e.get('description')),
1247                     })
1248                     if not info.get('title') and episode_name:
1249                         info['title'] = episode_name
1250                     part_of_season = e.get('partOfSeason')
1251                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1252                         info.update({
1253                             'season': unescapeHTML(part_of_season.get('name')),
1254                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1255                         })
1256                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1257                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1258                         info['series'] = unescapeHTML(part_of_series.get('name'))
1259                 elif item_type == 'Movie':
1260                     info.update({
1261                         'title': unescapeHTML(e.get('name')),
1262                         'description': unescapeHTML(e.get('description')),
1263                         'duration': parse_duration(e.get('duration')),
1264                         'timestamp': unified_timestamp(e.get('dateCreated')),
1265                     })
1266                 elif item_type in ('Article', 'NewsArticle'):
1267                     info.update({
1268                         'timestamp': parse_iso8601(e.get('datePublished')),
1269                         'title': unescapeHTML(e.get('headline')),
1270                         'description': unescapeHTML(e.get('articleBody')),
1271                     })
1272                 elif item_type == 'VideoObject':
1273                     extract_video_object(e)
1274                     continue
1275                 video = e.get('video')
1276                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1277                     extract_video_object(video)
1278                 break
1279         return dict((k, v) for k, v in info.items() if v is not None)
1280
1281     @staticmethod
1282     def _hidden_inputs(html):
1283         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1284         hidden_inputs = {}
1285         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1286             attrs = extract_attributes(input)
1287             if not input:
1288                 continue
1289             if attrs.get('type') not in ('hidden', 'submit'):
1290                 continue
1291             name = attrs.get('name') or attrs.get('id')
1292             value = attrs.get('value')
1293             if name and value is not None:
1294                 hidden_inputs[name] = value
1295         return hidden_inputs
1296
1297     def _form_hidden_inputs(self, form_id, html):
1298         form = self._search_regex(
1299             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1300             html, '%s form' % form_id, group='form')
1301         return self._hidden_inputs(form)
1302
1303     def _sort_formats(self, formats, field_preference=None):
1304         if not formats:
1305             raise ExtractorError('No video formats found')
1306
1307         for f in formats:
1308             # Automatically determine tbr when missing based on abr and vbr (improves
1309             # formats sorting in some cases)
1310             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1311                 f['tbr'] = f['abr'] + f['vbr']
1312
1313         def _formats_key(f):
1314             # TODO remove the following workaround
1315             from ..utils import determine_ext
1316             if not f.get('ext') and 'url' in f:
1317                 f['ext'] = determine_ext(f['url'])
1318
1319             if isinstance(field_preference, (list, tuple)):
1320                 return tuple(
1321                     f.get(field)
1322                     if f.get(field) is not None
1323                     else ('' if field == 'format_id' else -1)
1324                     for field in field_preference)
1325
1326             preference = f.get('preference')
1327             if preference is None:
1328                 preference = 0
1329                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1330                     preference -= 0.5
1331
1332             protocol = f.get('protocol') or determine_protocol(f)
1333             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1334
1335             if f.get('vcodec') == 'none':  # audio only
1336                 preference -= 50
1337                 if self._downloader.params.get('prefer_free_formats'):
1338                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1339                 else:
1340                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1341                 ext_preference = 0
1342                 try:
1343                     audio_ext_preference = ORDER.index(f['ext'])
1344                 except ValueError:
1345                     audio_ext_preference = -1
1346             else:
1347                 if f.get('acodec') == 'none':  # video only
1348                     preference -= 40
1349                 if self._downloader.params.get('prefer_free_formats'):
1350                     ORDER = ['flv', 'mp4', 'webm']
1351                 else:
1352                     ORDER = ['webm', 'flv', 'mp4']
1353                 try:
1354                     ext_preference = ORDER.index(f['ext'])
1355                 except ValueError:
1356                     ext_preference = -1
1357                 audio_ext_preference = 0
1358
1359             return (
1360                 preference,
1361                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1362                 f.get('quality') if f.get('quality') is not None else -1,
1363                 f.get('tbr') if f.get('tbr') is not None else -1,
1364                 f.get('filesize') if f.get('filesize') is not None else -1,
1365                 f.get('vbr') if f.get('vbr') is not None else -1,
1366                 f.get('height') if f.get('height') is not None else -1,
1367                 f.get('width') if f.get('width') is not None else -1,
1368                 proto_preference,
1369                 ext_preference,
1370                 f.get('abr') if f.get('abr') is not None else -1,
1371                 audio_ext_preference,
1372                 f.get('fps') if f.get('fps') is not None else -1,
1373                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1374                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1375                 f.get('format_id') if f.get('format_id') is not None else '',
1376             )
1377         formats.sort(key=_formats_key)
1378
1379     def _check_formats(self, formats, video_id):
1380         if formats:
1381             formats[:] = filter(
1382                 lambda f: self._is_valid_url(
1383                     f['url'], video_id,
1384                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1385                 formats)
1386
1387     @staticmethod
1388     def _remove_duplicate_formats(formats):
1389         format_urls = set()
1390         unique_formats = []
1391         for f in formats:
1392             if f['url'] not in format_urls:
1393                 format_urls.add(f['url'])
1394                 unique_formats.append(f)
1395         formats[:] = unique_formats
1396
1397     def _is_valid_url(self, url, video_id, item='video', headers={}):
1398         url = self._proto_relative_url(url, scheme='http:')
1399         # For now assume non HTTP(S) URLs always valid
1400         if not (url.startswith('http://') or url.startswith('https://')):
1401             return True
1402         try:
1403             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1404             return True
1405         except ExtractorError as e:
1406             if isinstance(e.cause, compat_urllib_error.URLError):
1407                 self.to_screen(
1408                     '%s: %s URL is invalid, skipping' % (video_id, item))
1409                 return False
1410             raise
1411
1412     def http_scheme(self):
1413         """ Either "http:" or "https:", depending on the user's preferences """
1414         return (
1415             'http:'
1416             if self._downloader.params.get('prefer_insecure', False)
1417             else 'https:')
1418
1419     def _proto_relative_url(self, url, scheme=None):
1420         if url is None:
1421             return url
1422         if url.startswith('//'):
1423             if scheme is None:
1424                 scheme = self.http_scheme()
1425             return scheme + url
1426         else:
1427             return url
1428
1429     def _sleep(self, timeout, video_id, msg_template=None):
1430         if msg_template is None:
1431             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1432         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1433         self.to_screen(msg)
1434         time.sleep(timeout)
1435
1436     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1437                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1438                              fatal=True, m3u8_id=None):
1439         manifest = self._download_xml(
1440             manifest_url, video_id, 'Downloading f4m manifest',
1441             'Unable to download f4m manifest',
1442             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1443             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1444             transform_source=transform_source,
1445             fatal=fatal)
1446
1447         if manifest is False:
1448             return []
1449
1450         return self._parse_f4m_formats(
1451             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1452             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1453
1454     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1455                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1456                            fatal=True, m3u8_id=None):
1457         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1458         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1459         if akamai_pv is not None and ';' in akamai_pv.text:
1460             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1461             if playerVerificationChallenge.strip() != '':
1462                 return []
1463
1464         formats = []
1465         manifest_version = '1.0'
1466         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1467         if not media_nodes:
1468             manifest_version = '2.0'
1469             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1470         # Remove unsupported DRM protected media from final formats
1471         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1472         media_nodes = remove_encrypted_media(media_nodes)
1473         if not media_nodes:
1474             return formats
1475
1476         manifest_base_url = get_base_url(manifest)
1477
1478         bootstrap_info = xpath_element(
1479             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1480             'bootstrap info', default=None)
1481
1482         vcodec = None
1483         mime_type = xpath_text(
1484             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1485             'base URL', default=None)
1486         if mime_type and mime_type.startswith('audio/'):
1487             vcodec = 'none'
1488
1489         for i, media_el in enumerate(media_nodes):
1490             tbr = int_or_none(media_el.attrib.get('bitrate'))
1491             width = int_or_none(media_el.attrib.get('width'))
1492             height = int_or_none(media_el.attrib.get('height'))
1493             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1494             # If <bootstrapInfo> is present, the specified f4m is a
1495             # stream-level manifest, and only set-level manifests may refer to
1496             # external resources.  See section 11.4 and section 4 of F4M spec
1497             if bootstrap_info is None:
1498                 media_url = None
1499                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1500                 if manifest_version == '2.0':
1501                     media_url = media_el.attrib.get('href')
1502                 if media_url is None:
1503                     media_url = media_el.attrib.get('url')
1504                 if not media_url:
1505                     continue
1506                 manifest_url = (
1507                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1508                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1509                 # If media_url is itself a f4m manifest do the recursive extraction
1510                 # since bitrates in parent manifest (this one) and media_url manifest
1511                 # may differ leading to inability to resolve the format by requested
1512                 # bitrate in f4m downloader
1513                 ext = determine_ext(manifest_url)
1514                 if ext == 'f4m':
1515                     f4m_formats = self._extract_f4m_formats(
1516                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1517                         transform_source=transform_source, fatal=fatal)
1518                     # Sometimes stream-level manifest contains single media entry that
1519                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1520                     # At the same time parent's media entry in set-level manifest may
1521                     # contain it. We will copy it from parent in such cases.
1522                     if len(f4m_formats) == 1:
1523                         f = f4m_formats[0]
1524                         f.update({
1525                             'tbr': f.get('tbr') or tbr,
1526                             'width': f.get('width') or width,
1527                             'height': f.get('height') or height,
1528                             'format_id': f.get('format_id') if not tbr else format_id,
1529                             'vcodec': vcodec,
1530                         })
1531                     formats.extend(f4m_formats)
1532                     continue
1533                 elif ext == 'm3u8':
1534                     formats.extend(self._extract_m3u8_formats(
1535                         manifest_url, video_id, 'mp4', preference=preference,
1536                         m3u8_id=m3u8_id, fatal=fatal))
1537                     continue
1538             formats.append({
1539                 'format_id': format_id,
1540                 'url': manifest_url,
1541                 'manifest_url': manifest_url,
1542                 'ext': 'flv' if bootstrap_info is not None else None,
1543                 'protocol': 'f4m',
1544                 'tbr': tbr,
1545                 'width': width,
1546                 'height': height,
1547                 'vcodec': vcodec,
1548                 'preference': preference,
1549             })
1550         return formats
1551
1552     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1553         return {
1554             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1555             'url': m3u8_url,
1556             'ext': ext,
1557             'protocol': 'm3u8',
1558             'preference': preference - 100 if preference else -100,
1559             'resolution': 'multiple',
1560             'format_note': 'Quality selection URL',
1561         }
1562
1563     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1564                               entry_protocol='m3u8', preference=None,
1565                               m3u8_id=None, note=None, errnote=None,
1566                               fatal=True, live=False):
1567         res = self._download_webpage_handle(
1568             m3u8_url, video_id,
1569             note=note or 'Downloading m3u8 information',
1570             errnote=errnote or 'Failed to download m3u8 information',
1571             fatal=fatal)
1572
1573         if res is False:
1574             return []
1575
1576         m3u8_doc, urlh = res
1577         m3u8_url = urlh.geturl()
1578
1579         return self._parse_m3u8_formats(
1580             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1581             preference=preference, m3u8_id=m3u8_id, live=live)
1582
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Build a list of format dicts from the text of an m3u8 playlist.

        m3u8_doc       -- playlist contents as a string
        m3u8_url       -- URL of the playlist; relative entries are joined
                          against it and it is stored as 'manifest_url'
        ext            -- value for the 'ext' field of every format
        entry_protocol -- value for the 'protocol' field of every format
        preference     -- value for the 'preference' field of every format
        m3u8_id        -- optional prefix for generated format ids
        live           -- when true, neither stream name nor bitrate is
                          appended to the format id of variant streams

        Returns an empty list for playlists carrying an #EXT-X-FAXS-CM tag
        or an #EXT-X-SESSION-KEY tag with an skd:// URI. A media playlist
        yields a single format pointing at m3u8_url. Otherwise one format is
        produced per VIDEO/AUDIO #EXT-X-MEDIA rendition that has a URI and
        per URI line of the playlist.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are kept as is, everything else is resolved
        # against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211
        # 3. https://github.com/rg3/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # Rendition groups keyed by GROUP-ID, filled in by extract_media()
        groups = {}
        # Attributes of the most recent #EXT-X-STREAM-INF tag; reset after
        # the URI line that follows it has been consumed
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-blank line is treated as a stream URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1747
1748     @staticmethod
1749     def _xpath_ns(path, namespace=None):
1750         if not namespace:
1751             return path
1752         out = []
1753         for c in path.split('/'):
1754             if not c or c == '.':
1755                 out.append(c)
1756             else:
1757                 out.append('{%s}%s' % (namespace, c))
1758         return '/'.join(out)
1759
1760     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1761         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1762
1763         if smil is False:
1764             assert not fatal
1765             return []
1766
1767         namespace = self._parse_smil_namespace(smil)
1768
1769         return self._parse_smil_formats(
1770             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1771
1772     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1773         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1774         if smil is False:
1775             return {}
1776         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1777
1778     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1779         return self._download_xml(
1780             smil_url, video_id, 'Downloading SMIL file',
1781             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1782
1783     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1784         namespace = self._parse_smil_namespace(smil)
1785
1786         formats = self._parse_smil_formats(
1787             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1788         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1789
1790         video_id = os.path.splitext(url_basename(smil_url))[0]
1791         title = None
1792         description = None
1793         upload_date = None
1794         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1795             name = meta.attrib.get('name')
1796             content = meta.attrib.get('content')
1797             if not name or not content:
1798                 continue
1799             if not title and name == 'title':
1800                 title = content
1801             elif not description and name in ('description', 'abstract'):
1802                 description = content
1803             elif not upload_date and name == 'date':
1804                 upload_date = unified_strdate(content)
1805
1806         thumbnails = [{
1807             'id': image.get('type'),
1808             'url': image.get('src'),
1809             'width': int_or_none(image.get('width')),
1810             'height': int_or_none(image.get('height')),
1811         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1812
1813         return {
1814             'id': video_id,
1815             'title': title or video_id,
1816             'description': description,
1817             'upload_date': upload_date,
1818             'thumbnails': thumbnails,
1819             'formats': formats,
1820             'subtitles': subtitles,
1821         }
1822
1823     def _parse_smil_namespace(self, smil):
1824         return self._search_regex(
1825             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1826
1827     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1828         base = smil_url
1829         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1830             b = meta.get('base') or meta.get('httpBase')
1831             if b:
1832                 base = b
1833                 break
1834
1835         formats = []
1836         rtmp_count = 0
1837         http_count = 0
1838         m3u8_count = 0
1839
1840         srcs = []
1841         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1842         for medium in media:
1843             src = medium.get('src')
1844             if not src or src in srcs:
1845                 continue
1846             srcs.append(src)
1847
1848             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1849             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1850             width = int_or_none(medium.get('width'))
1851             height = int_or_none(medium.get('height'))
1852             proto = medium.get('proto')
1853             ext = medium.get('ext')
1854             src_ext = determine_ext(src)
1855             streamer = medium.get('streamer') or base
1856
1857             if proto == 'rtmp' or streamer.startswith('rtmp'):
1858                 rtmp_count += 1
1859                 formats.append({
1860                     'url': streamer,
1861                     'play_path': src,
1862                     'ext': 'flv',
1863                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1864                     'tbr': bitrate,
1865                     'filesize': filesize,
1866                     'width': width,
1867                     'height': height,
1868                 })
1869                 if transform_rtmp_url:
1870                     streamer, src = transform_rtmp_url(streamer, src)
1871                     formats[-1].update({
1872                         'url': streamer,
1873                         'play_path': src,
1874                     })
1875                 continue
1876
1877             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1878             src_url = src_url.strip()
1879
1880             if proto == 'm3u8' or src_ext == 'm3u8':
1881                 m3u8_formats = self._extract_m3u8_formats(
1882                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1883                 if len(m3u8_formats) == 1:
1884                     m3u8_count += 1
1885                     m3u8_formats[0].update({
1886                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1887                         'tbr': bitrate,
1888                         'width': width,
1889                         'height': height,
1890                     })
1891                 formats.extend(m3u8_formats)
1892             elif src_ext == 'f4m':
1893                 f4m_url = src_url
1894                 if not f4m_params:
1895                     f4m_params = {
1896                         'hdcore': '3.2.0',
1897                         'plugin': 'flowplayer-3.2.0.1',
1898                     }
1899                 f4m_url += '&' if '?' in f4m_url else '?'
1900                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1901                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1902             elif src_ext == 'mpd':
1903                 formats.extend(self._extract_mpd_formats(
1904                     src_url, video_id, mpd_id='dash', fatal=False))
1905             elif re.search(r'\.ism/[Mm]anifest', src_url):
1906                 formats.extend(self._extract_ism_formats(
1907                     src_url, video_id, ism_id='mss', fatal=False))
1908             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1909                 http_count += 1
1910                 formats.append({
1911                     'url': src_url,
1912                     'ext': ext or src_ext or 'flv',
1913                     'format_id': 'http-%d' % (bitrate or http_count),
1914                     'tbr': bitrate,
1915                     'filesize': filesize,
1916                     'width': width,
1917                     'height': height,
1918                 })
1919
1920         return formats
1921
1922     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1923         urls = []
1924         subtitles = {}
1925         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1926             src = textstream.get('src')
1927             if not src or src in urls:
1928                 continue
1929             urls.append(src)
1930             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1931             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1932             subtitles.setdefault(lang, []).append({
1933                 'url': src,
1934                 'ext': ext,
1935             })
1936         return subtitles
1937
1938     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1939         xspf = self._download_xml(
1940             xspf_url, playlist_id, 'Downloading xpsf playlist',
1941             'Unable to download xspf manifest', fatal=fatal)
1942         if xspf is False:
1943             return []
1944         return self._parse_xspf(
1945             xspf, playlist_id, xspf_url=xspf_url,
1946             xspf_base_url=base_url(xspf_url))
1947
1948     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1949         NS_MAP = {
1950             'xspf': 'http://xspf.org/ns/0/',
1951             's1': 'http://static.streamone.nl/player/ns/0',
1952         }
1953
1954         entries = []
1955         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1956             title = xpath_text(
1957                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1958             description = xpath_text(
1959                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1960             thumbnail = xpath_text(
1961                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1962             duration = float_or_none(
1963                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1964
1965             formats = []
1966             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1967                 format_url = urljoin(xspf_base_url, location.text)
1968                 if not format_url:
1969                     continue
1970                 formats.append({
1971                     'url': format_url,
1972                     'manifest_url': xspf_url,
1973                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1974                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1975                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1976                 })
1977             self._sort_formats(formats)
1978
1979             entries.append({
1980                 'id': playlist_id,
1981                 'title': title,
1982                 'description': description,
1983                 'thumbnail': thumbnail,
1984                 'duration': duration,
1985                 'formats': formats,
1986             })
1987         return entries
1988
1989     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1990         res = self._download_xml_handle(
1991             mpd_url, video_id,
1992             note=note or 'Downloading MPD manifest',
1993             errnote=errnote or 'Failed to download MPD manifest',
1994             fatal=fatal)
1995         if res is False:
1996             return []
1997         mpd_doc, urlh = res
1998         mpd_base_url = base_url(urlh.geturl())
1999
2000         return self._parse_mpd_formats(
2001             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
2002             formats_dict=formats_dict, mpd_url=mpd_url)
2003
2004     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
2005         """
2006         Parse formats from MPD manifest.
2007         References:
2008          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2009             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2010          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2011         """
2012         if mpd_doc.get('type') == 'dynamic':
2013             return []
2014
2015         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2016
2017         def _add_ns(path):
2018             return self._xpath_ns(path, namespace)
2019
2020         def is_drm_protected(element):
2021             return element.find(_add_ns('ContentProtection')) is not None
2022
2023         def extract_multisegment_info(element, ms_parent_info):
2024             ms_info = ms_parent_info.copy()
2025
2026             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2027             # common attributes and elements.  We will only extract relevant
2028             # for us.
2029             def extract_common(source):
2030                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2031                 if segment_timeline is not None:
2032                     s_e = segment_timeline.findall(_add_ns('S'))
2033                     if s_e:
2034                         ms_info['total_number'] = 0
2035                         ms_info['s'] = []
2036                         for s in s_e:
2037                             r = int(s.get('r', 0))
2038                             ms_info['total_number'] += 1 + r
2039                             ms_info['s'].append({
2040                                 't': int(s.get('t', 0)),
2041                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2042                                 'd': int(s.attrib['d']),
2043                                 'r': r,
2044                             })
2045                 start_number = source.get('startNumber')
2046                 if start_number:
2047                     ms_info['start_number'] = int(start_number)
2048                 timescale = source.get('timescale')
2049                 if timescale:
2050                     ms_info['timescale'] = int(timescale)
2051                 segment_duration = source.get('duration')
2052                 if segment_duration:
2053                     ms_info['segment_duration'] = float(segment_duration)
2054
2055             def extract_Initialization(source):
2056                 initialization = source.find(_add_ns('Initialization'))
2057                 if initialization is not None:
2058                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2059
2060             segment_list = element.find(_add_ns('SegmentList'))
2061             if segment_list is not None:
2062                 extract_common(segment_list)
2063                 extract_Initialization(segment_list)
2064                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2065                 if segment_urls_e:
2066                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2067             else:
2068                 segment_template = element.find(_add_ns('SegmentTemplate'))
2069                 if segment_template is not None:
2070                     extract_common(segment_template)
2071                     media = segment_template.get('media')
2072                     if media:
2073                         ms_info['media'] = media
2074                     initialization = segment_template.get('initialization')
2075                     if initialization:
2076                         ms_info['initialization'] = initialization
2077                     else:
2078                         extract_Initialization(segment_template)
2079             return ms_info
2080
2081         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2082         formats = []
2083         for period in mpd_doc.findall(_add_ns('Period')):
2084             period_duration = parse_duration(period.get('duration')) or mpd_duration
2085             period_ms_info = extract_multisegment_info(period, {
2086                 'start_number': 1,
2087                 'timescale': 1,
2088             })
2089             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2090                 if is_drm_protected(adaptation_set):
2091                     continue
2092                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2093                 for representation in adaptation_set.findall(_add_ns('Representation')):
2094                     if is_drm_protected(representation):
2095                         continue
2096                     representation_attrib = adaptation_set.attrib.copy()
2097                     representation_attrib.update(representation.attrib)
2098                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2099                     mime_type = representation_attrib['mimeType']
2100                     content_type = mime_type.split('/')[0]
2101                     if content_type == 'text':
2102                         # TODO implement WebVTT downloading
2103                         pass
2104                     elif content_type in ('video', 'audio'):
2105                         base_url = ''
2106                         for element in (representation, adaptation_set, period, mpd_doc):
2107                             base_url_e = element.find(_add_ns('BaseURL'))
2108                             if base_url_e is not None:
2109                                 base_url = base_url_e.text + base_url
2110                                 if re.match(r'^https?://', base_url):
2111                                     break
2112                         if mpd_base_url and not re.match(r'^https?://', base_url):
2113                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2114                                 mpd_base_url += '/'
2115                             base_url = mpd_base_url + base_url
2116                         representation_id = representation_attrib.get('id')
2117                         lang = representation_attrib.get('lang')
2118                         url_el = representation.find(_add_ns('BaseURL'))
2119                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2120                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2121                         f = {
2122                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2123                             'url': base_url,
2124                             'manifest_url': mpd_url,
2125                             'ext': mimetype2ext(mime_type),
2126                             'width': int_or_none(representation_attrib.get('width')),
2127                             'height': int_or_none(representation_attrib.get('height')),
2128                             'tbr': float_or_none(bandwidth, 1000),
2129                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2130                             'fps': int_or_none(representation_attrib.get('frameRate')),
2131                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2132                             'format_note': 'DASH %s' % content_type,
2133                             'filesize': filesize,
2134                             'container': mimetype2ext(mime_type) + '_dash',
2135                         }
2136                         f.update(parse_codecs(representation_attrib.get('codecs')))
2137                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2138
2139                         def prepare_template(template_name, identifiers):
2140                             tmpl = representation_ms_info[template_name]
2141                             # First of, % characters outside $...$ templates
2142                             # must be escaped by doubling for proper processing
2143                             # by % operator string formatting used further (see
2144                             # https://github.com/rg3/youtube-dl/issues/16867).
2145                             t = ''
2146                             in_template = False
2147                             for c in tmpl:
2148                                 t += c
2149                                 if c == '$':
2150                                     in_template = not in_template
2151                                 elif c == '%' and not in_template:
2152                                     t += c
2153                             # Next, $...$ templates are translated to their
2154                             # %(...) counterparts to be used with % operator
2155                             t = t.replace('$RepresentationID$', representation_id)
2156                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2157                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2158                             t.replace('$$', '$')
2159                             return t
2160
2161                         # @initialization is a regular template like @media one
2162                         # so it should be handled just the same way (see
2163                         # https://github.com/rg3/youtube-dl/issues/11605)
2164                         if 'initialization' in representation_ms_info:
2165                             initialization_template = prepare_template(
2166                                 'initialization',
2167                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2168                                 # $Time$ shall not be included for @initialization thus
2169                                 # only $Bandwidth$ remains
2170                                 ('Bandwidth', ))
2171                             representation_ms_info['initialization_url'] = initialization_template % {
2172                                 'Bandwidth': bandwidth,
2173                             }
2174
2175                         def location_key(location):
2176                             return 'url' if re.match(r'^https?://', location) else 'path'
2177
2178                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2179
2180                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2181                             media_location_key = location_key(media_template)
2182
2183                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2184                             # can't be used at the same time
2185                             if '%(Number' in media_template and 's' not in representation_ms_info:
2186                                 segment_duration = None
2187                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2188                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2189                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2190                                 representation_ms_info['fragments'] = [{
2191                                     media_location_key: media_template % {
2192                                         'Number': segment_number,
2193                                         'Bandwidth': bandwidth,
2194                                     },
2195                                     'duration': segment_duration,
2196                                 } for segment_number in range(
2197                                     representation_ms_info['start_number'],
2198                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2199                             else:
2200                                 # $Number*$ or $Time$ in media template with S list available
2201                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2202                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2203                                 representation_ms_info['fragments'] = []
2204                                 segment_time = 0
2205                                 segment_d = None
2206                                 segment_number = representation_ms_info['start_number']
2207
2208                                 def add_segment_url():
2209                                     segment_url = media_template % {
2210                                         'Time': segment_time,
2211                                         'Bandwidth': bandwidth,
2212                                         'Number': segment_number,
2213                                     }
2214                                     representation_ms_info['fragments'].append({
2215                                         media_location_key: segment_url,
2216                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2217                                     })
2218
2219                                 for num, s in enumerate(representation_ms_info['s']):
2220                                     segment_time = s.get('t') or segment_time
2221                                     segment_d = s['d']
2222                                     add_segment_url()
2223                                     segment_number += 1
2224                                     for r in range(s.get('r', 0)):
2225                                         segment_time += segment_d
2226                                         add_segment_url()
2227                                         segment_number += 1
2228                                     segment_time += segment_d
2229                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2230                             # No media template
2231                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2232                             # or any YouTube dashsegments video
2233                             fragments = []
2234                             segment_index = 0
2235                             timescale = representation_ms_info['timescale']
2236                             for s in representation_ms_info['s']:
2237                                 duration = float_or_none(s['d'], timescale)
2238                                 for r in range(s.get('r', 0) + 1):
2239                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2240                                     fragments.append({
2241                                         location_key(segment_uri): segment_uri,
2242                                         'duration': duration,
2243                                     })
2244                                     segment_index += 1
2245                             representation_ms_info['fragments'] = fragments
2246                         elif 'segment_urls' in representation_ms_info:
2247                             # Segment URLs with no SegmentTimeline
2248                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2249                             # https://github.com/rg3/youtube-dl/pull/14844
2250                             fragments = []
2251                             segment_duration = float_or_none(
2252                                 representation_ms_info['segment_duration'],
2253                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2254                             for segment_url in representation_ms_info['segment_urls']:
2255                                 fragment = {
2256                                     location_key(segment_url): segment_url,
2257                                 }
2258                                 if segment_duration:
2259                                     fragment['duration'] = segment_duration
2260                                 fragments.append(fragment)
2261                             representation_ms_info['fragments'] = fragments
2262                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2263                         # No fragments key is present in this case.
2264                         if 'fragments' in representation_ms_info:
2265                             f.update({
2266                                 'fragment_base_url': base_url,
2267                                 'fragments': [],
2268                                 'protocol': 'http_dash_segments',
2269                             })
2270                             if 'initialization_url' in representation_ms_info:
2271                                 initialization_url = representation_ms_info['initialization_url']
2272                                 if not f.get('url'):
2273                                     f['url'] = initialization_url
2274                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2275                             f['fragments'].extend(representation_ms_info['fragments'])
2276                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2277                         # is not necessarily unique within a Period thus formats with
2278                         # the same `format_id` are quite possible. There are numerous examples
2279                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2280                         # https://github.com/rg3/youtube-dl/issues/13919)
2281                         full_info = formats_dict.get(representation_id, {}).copy()
2282                         full_info.update(f)
2283                         formats.append(full_info)
2284                     else:
2285                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2286         return formats
2287
2288     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2289         res = self._download_xml_handle(
2290             ism_url, video_id,
2291             note=note or 'Downloading ISM manifest',
2292             errnote=errnote or 'Failed to download ISM manifest',
2293             fatal=fatal)
2294         if res is False:
2295             return []
2296         ism_doc, urlh = res
2297
2298         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2299
2300     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2301         """
2302         Parse formats from ISM manifest.
2303         References:
2304          1. [MS-SSTR]: Smooth Streaming Protocol,
2305             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2306         """
2307         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2308             return []
2309
2310         duration = int(ism_doc.attrib['Duration'])
2311         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2312
2313         formats = []
2314         for stream in ism_doc.findall('StreamIndex'):
2315             stream_type = stream.get('Type')
2316             if stream_type not in ('video', 'audio'):
2317                 continue
2318             url_pattern = stream.attrib['Url']
2319             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2320             stream_name = stream.get('Name')
2321             for track in stream.findall('QualityLevel'):
2322                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2323                 # TODO: add support for WVC1 and WMAP
2324                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2325                     self.report_warning('%s is not a supported codec' % fourcc)
2326                     continue
2327                 tbr = int(track.attrib['Bitrate']) // 1000
2328                 # [1] does not mention Width and Height attributes. However,
2329                 # they're often present while MaxWidth and MaxHeight are
2330                 # missing, so should be used as fallbacks
2331                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2332                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2333                 sampling_rate = int_or_none(track.get('SamplingRate'))
2334
2335                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2336                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2337
2338                 fragments = []
2339                 fragment_ctx = {
2340                     'time': 0,
2341                 }
2342                 stream_fragments = stream.findall('c')
2343                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2344                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2345                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2346                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2347                     if not fragment_ctx['duration']:
2348                         try:
2349                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2350                         except IndexError:
2351                             next_fragment_time = duration
2352                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2353                     for _ in range(fragment_repeat):
2354                         fragments.append({
2355                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2356                             'duration': fragment_ctx['duration'] / stream_timescale,
2357                         })
2358                         fragment_ctx['time'] += fragment_ctx['duration']
2359
2360                 format_id = []
2361                 if ism_id:
2362                     format_id.append(ism_id)
2363                 if stream_name:
2364                     format_id.append(stream_name)
2365                 format_id.append(compat_str(tbr))
2366
2367                 formats.append({
2368                     'format_id': '-'.join(format_id),
2369                     'url': ism_url,
2370                     'manifest_url': ism_url,
2371                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2372                     'width': width,
2373                     'height': height,
2374                     'tbr': tbr,
2375                     'asr': sampling_rate,
2376                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2377                     'acodec': 'none' if stream_type == 'video' else fourcc,
2378                     'protocol': 'ism',
2379                     'fragments': fragments,
2380                     '_download_params': {
2381                         'duration': duration,
2382                         'timescale': stream_timescale,
2383                         'width': width or 0,
2384                         'height': height or 0,
2385                         'fourcc': fourcc,
2386                         'codec_private_data': track.get('CodecPrivateData'),
2387                         'sampling_rate': sampling_rate,
2388                         'channels': int_or_none(track.get('Channels', 2)),
2389                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2390                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2391                     },
2392                 })
2393         return formats
2394
2395     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2396         def absolute_url(item_url):
2397             return urljoin(base_url, item_url)
2398
2399         def parse_content_type(content_type):
2400             if not content_type:
2401                 return {}
2402             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2403             if ctr:
2404                 mimetype, codecs = ctr.groups()
2405                 f = parse_codecs(codecs)
2406                 f['ext'] = mimetype2ext(mimetype)
2407                 return f
2408             return {}
2409
2410         def _media_formats(src, cur_media_type, type_info={}):
2411             full_url = absolute_url(src)
2412             ext = type_info.get('ext') or determine_ext(full_url)
2413             if ext == 'm3u8':
2414                 is_plain_url = False
2415                 formats = self._extract_m3u8_formats(
2416                     full_url, video_id, ext='mp4',
2417                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2418                     preference=preference, fatal=False)
2419             elif ext == 'mpd':
2420                 is_plain_url = False
2421                 formats = self._extract_mpd_formats(
2422                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2423             else:
2424                 is_plain_url = True
2425                 formats = [{
2426                     'url': full_url,
2427                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2428                 }]
2429             return is_plain_url, formats
2430
2431         entries = []
2432         # amp-video and amp-audio are very similar to their HTML5 counterparts
2433         # so we wll include them right here (see
2434         # https://www.ampproject.org/docs/reference/components/amp-video)
2435         media_tags = [(media_tag, media_type, '')
2436                       for media_tag, media_type
2437                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2438         media_tags.extend(re.findall(
2439             # We only allow video|audio followed by a whitespace or '>'.
2440             # Allowing more characters may end up in significant slow down (see
2441             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2442             # http://www.porntrex.com/maps/videositemap.xml).
2443             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2444         for media_tag, media_type, media_content in media_tags:
2445             media_info = {
2446                 'formats': [],
2447                 'subtitles': {},
2448             }
2449             media_attributes = extract_attributes(media_tag)
2450             src = media_attributes.get('src')
2451             if src:
2452                 _, formats = _media_formats(src, media_type)
2453                 media_info['formats'].extend(formats)
2454             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2455             if media_content:
2456                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2457                     source_attributes = extract_attributes(source_tag)
2458                     src = source_attributes.get('src')
2459                     if not src:
2460                         continue
2461                     f = parse_content_type(source_attributes.get('type'))
2462                     is_plain_url, formats = _media_formats(src, media_type, f)
2463                     if is_plain_url:
2464                         # res attribute is not standard but seen several times
2465                         # in the wild
2466                         f.update({
2467                             'height': int_or_none(source_attributes.get('res')),
2468                             'format_id': source_attributes.get('label'),
2469                         })
2470                         f.update(formats[0])
2471                         media_info['formats'].append(f)
2472                     else:
2473                         media_info['formats'].extend(formats)
2474                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2475                     track_attributes = extract_attributes(track_tag)
2476                     kind = track_attributes.get('kind')
2477                     if not kind or kind in ('subtitles', 'captions'):
2478                         src = track_attributes.get('src')
2479                         if not src:
2480                             continue
2481                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2482                         media_info['subtitles'].setdefault(lang, []).append({
2483                             'url': absolute_url(src),
2484                         })
2485             for f in media_info['formats']:
2486                 f.setdefault('http_headers', {})['Referer'] = base_url
2487             if media_info['formats'] or media_info['subtitles']:
2488                 entries.append(media_info)
2489         return entries
2490
2491     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2492         formats = []
2493         hdcore_sign = 'hdcore=3.7.0'
2494         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2495         hds_host = hosts.get('hds')
2496         if hds_host:
2497             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2498         if 'hdcore=' not in f4m_url:
2499             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2500         f4m_formats = self._extract_f4m_formats(
2501             f4m_url, video_id, f4m_id='hds', fatal=False)
2502         for entry in f4m_formats:
2503             entry.update({'extra_param_to_segment_url': hdcore_sign})
2504         formats.extend(f4m_formats)
2505         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2506         hls_host = hosts.get('hls')
2507         if hls_host:
2508             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2509         formats.extend(self._extract_m3u8_formats(
2510             m3u8_url, video_id, 'mp4', 'm3u8_native',
2511             m3u8_id='hls', fatal=False))
2512         return formats
2513
2514     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2515         query = compat_urlparse.urlparse(url).query
2516         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2517         mobj = re.search(
2518             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2519         url_base = mobj.group('url')
2520         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2521         formats = []
2522
2523         def manifest_url(manifest):
2524             m_url = '%s/%s' % (http_base_url, manifest)
2525             if query:
2526                 m_url += '?%s' % query
2527             return m_url
2528
2529         if 'm3u8' not in skip_protocols:
2530             formats.extend(self._extract_m3u8_formats(
2531                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2532                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2533         if 'f4m' not in skip_protocols:
2534             formats.extend(self._extract_f4m_formats(
2535                 manifest_url('manifest.f4m'),
2536                 video_id, f4m_id='hds', fatal=False))
2537         if 'dash' not in skip_protocols:
2538             formats.extend(self._extract_mpd_formats(
2539                 manifest_url('manifest.mpd'),
2540                 video_id, mpd_id='dash', fatal=False))
2541         if re.search(r'(?:/smil:|\.smil)', url_base):
2542             if 'smil' not in skip_protocols:
2543                 rtmp_formats = self._extract_smil_formats(
2544                     manifest_url('jwplayer.smil'),
2545                     video_id, fatal=False)
2546                 for rtmp_format in rtmp_formats:
2547                     rtsp_format = rtmp_format.copy()
2548                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2549                     del rtsp_format['play_path']
2550                     del rtsp_format['ext']
2551                     rtsp_format.update({
2552                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2553                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2554                         'protocol': 'rtsp',
2555                     })
2556                     formats.extend([rtmp_format, rtsp_format])
2557         else:
2558             for protocol in ('rtmp', 'rtsp'):
2559                 if protocol not in skip_protocols:
2560                     formats.append({
2561                         'url': '%s:%s' % (protocol, url_base),
2562                         'format_id': protocol,
2563                         'protocol': protocol,
2564                     })
2565         return formats
2566
2567     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2568         mobj = re.search(
2569             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2570             webpage)
2571         if mobj:
2572             try:
2573                 jwplayer_data = self._parse_json(mobj.group('options'),
2574                                                  video_id=video_id,
2575                                                  transform_source=transform_source)
2576             except ExtractorError:
2577                 pass
2578             else:
2579                 if isinstance(jwplayer_data, dict):
2580                     return jwplayer_data
2581
2582     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2583         jwplayer_data = self._find_jwplayer_data(
2584             webpage, video_id, transform_source=js_to_json)
2585         return self._parse_jwplayer_data(
2586             jwplayer_data, video_id, *args, **kwargs)
2587
2588     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2589                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2590         # JWPlayer backward compatibility: flattened playlists
2591         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2592         if 'playlist' not in jwplayer_data:
2593             jwplayer_data = {'playlist': [jwplayer_data]}
2594
2595         entries = []
2596
2597         # JWPlayer backward compatibility: single playlist item
2598         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2599         if not isinstance(jwplayer_data['playlist'], list):
2600             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2601
2602         for video_data in jwplayer_data['playlist']:
2603             # JWPlayer backward compatibility: flattened sources
2604             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2605             if 'sources' not in video_data:
2606                 video_data['sources'] = [video_data]
2607
2608             this_video_id = video_id or video_data['mediaid']
2609
2610             formats = self._parse_jwplayer_formats(
2611                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2612                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2613
2614             subtitles = {}
2615             tracks = video_data.get('tracks')
2616             if tracks and isinstance(tracks, list):
2617                 for track in tracks:
2618                     if not isinstance(track, dict):
2619                         continue
2620                     track_kind = track.get('kind')
2621                     if not track_kind or not isinstance(track_kind, compat_str):
2622                         continue
2623                     if track_kind.lower() not in ('captions', 'subtitles'):
2624                         continue
2625                     track_url = urljoin(base_url, track.get('file'))
2626                     if not track_url:
2627                         continue
2628                     subtitles.setdefault(track.get('label') or 'en', []).append({
2629                         'url': self._proto_relative_url(track_url)
2630                     })
2631
2632             entry = {
2633                 'id': this_video_id,
2634                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2635                 'description': video_data.get('description'),
2636                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
2637                 'timestamp': int_or_none(video_data.get('pubdate')),
2638                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2639                 'subtitles': subtitles,
2640             }
2641             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2642             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2643                 entry.update({
2644                     '_type': 'url_transparent',
2645                     'url': formats[0]['url'],
2646                 })
2647             else:
2648                 self._sort_formats(formats)
2649                 entry['formats'] = formats
2650             entries.append(entry)
2651         if len(entries) == 1:
2652             return entries[0]
2653         else:
2654             return self.playlist_result(entries)
2655
2656     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2657                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2658         urls = []
2659         formats = []
2660         for source in jwplayer_sources_data:
2661             if not isinstance(source, dict):
2662                 continue
2663             source_url = urljoin(
2664                 base_url, self._proto_relative_url(source.get('file')))
2665             if not source_url or source_url in urls:
2666                 continue
2667             urls.append(source_url)
2668             source_type = source.get('type') or ''
2669             ext = mimetype2ext(source_type) or determine_ext(source_url)
2670             if source_type == 'hls' or ext == 'm3u8':
2671                 formats.extend(self._extract_m3u8_formats(
2672                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2673                     m3u8_id=m3u8_id, fatal=False))
2674             elif source_type == 'dash' or ext == 'mpd':
2675                 formats.extend(self._extract_mpd_formats(
2676                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2677             elif ext == 'smil':
2678                 formats.extend(self._extract_smil_formats(
2679                     source_url, video_id, fatal=False))
2680             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2681             elif source_type.startswith('audio') or ext in (
2682                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2683                 formats.append({
2684                     'url': source_url,
2685                     'vcodec': 'none',
2686                     'ext': ext,
2687                 })
2688             else:
2689                 height = int_or_none(source.get('height'))
2690                 if height is None:
2691                     # Often no height is provided but there is a label in
2692                     # format like "1080p", "720p SD", or 1080.
2693                     height = int_or_none(self._search_regex(
2694                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2695                         'height', default=None))
2696                 a_format = {
2697                     'url': source_url,
2698                     'width': int_or_none(source.get('width')),
2699                     'height': height,
2700                     'tbr': int_or_none(source.get('bitrate')),
2701                     'ext': ext,
2702                 }
2703                 if source_url.startswith('rtmp'):
2704                     a_format['ext'] = 'flv'
2705                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2706                     # of jwplayer.flash.swf
2707                     rtmp_url_parts = re.split(
2708                         r'((?:mp4|mp3|flv):)', source_url, 1)
2709                     if len(rtmp_url_parts) == 3:
2710                         rtmp_url, prefix, play_path = rtmp_url_parts
2711                         a_format.update({
2712                             'url': rtmp_url,
2713                             'play_path': prefix + play_path,
2714                         })
2715                     if rtmp_params:
2716                         a_format.update(rtmp_params)
2717                 formats.append(a_format)
2718         return formats
2719
2720     def _live_title(self, name):
2721         """ Generate the title for a live video """
2722         now = datetime.datetime.now()
2723         now_str = now.strftime('%Y-%m-%d %H:%M')
2724         return name + ' ' + now_str
2725
2726     def _int(self, v, name, fatal=False, **kwargs):
2727         res = int_or_none(v, **kwargs)
2728         if 'get_attr' in kwargs:
2729             print(getattr(v, kwargs['get_attr']))
2730         if res is None:
2731             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2732             if fatal:
2733                 raise ExtractorError(msg)
2734             else:
2735                 self._downloader.report_warning(msg)
2736         return res
2737
2738     def _float(self, v, name, fatal=False, **kwargs):
2739         res = float_or_none(v, **kwargs)
2740         if res is None:
2741             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2742             if fatal:
2743                 raise ExtractorError(msg)
2744             else:
2745                 self._downloader.report_warning(msg)
2746         return res
2747
2748     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2749                     path='/', secure=False, discard=False, rest={}, **kwargs):
2750         cookie = compat_cookiejar.Cookie(
2751             0, name, value, port, port is not None, domain, True,
2752             domain.startswith('.'), path, True, secure, expire_time,
2753             discard, None, None, rest)
2754         self._downloader.cookiejar.set_cookie(cookie)
2755
2756     def _get_cookies(self, url):
2757         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2758         req = sanitized_Request(url)
2759         self._downloader.cookiejar.add_cookie_header(req)
2760         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2761
2762     def get_testcases(self, include_onlymatching=False):
2763         t = getattr(self, '_TEST', None)
2764         if t:
2765             assert not hasattr(self, '_TESTS'), \
2766                 '%s has _TEST and _TESTS' % type(self).__name__
2767             tests = [t]
2768         else:
2769             tests = getattr(self, '_TESTS', [])
2770         for t in tests:
2771             if not include_onlymatching and t.get('only_matching', False):
2772                 continue
2773             t['name'] = type(self).__name__[:-len('IE')]
2774             yield t
2775
2776     def is_suitable(self, age_limit):
2777         """ Test whether the extractor is generally suitable for the given
2778         age limit (i.e. pornographic sites are not, all others usually are) """
2779
2780         any_restricted = False
2781         for tc in self.get_testcases(include_onlymatching=False):
2782             if tc.get('playlist', []):
2783                 tc = tc['playlist'][0]
2784             is_restricted = age_restricted(
2785                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2786             if not is_restricted:
2787                 return True
2788             any_restricted = any_restricted or is_restricted
2789         return not any_restricted
2790
2791     def extract_subtitles(self, *args, **kwargs):
2792         if (self._downloader.params.get('writesubtitles', False) or
2793                 self._downloader.params.get('listsubtitles')):
2794             return self._get_subtitles(*args, **kwargs)
2795         return {}
2796
2797     def _get_subtitles(self, *args, **kwargs):
2798         raise NotImplementedError('This method must be implemented by subclasses')
2799
2800     @staticmethod
2801     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2802         """ Merge subtitle items for one language. Items with duplicated URLs
2803         will be dropped. """
2804         list1_urls = set([item['url'] for item in subtitle_list1])
2805         ret = list(subtitle_list1)
2806         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2807         return ret
2808
2809     @classmethod
2810     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2811         """ Merge two subtitle dictionaries, language by language. """
2812         ret = dict(subtitle_dict1)
2813         for lang in subtitle_dict2:
2814             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2815         return ret
2816
2817     def extract_automatic_captions(self, *args, **kwargs):
2818         if (self._downloader.params.get('writeautomaticsub', False) or
2819                 self._downloader.params.get('listsubtitles')):
2820             return self._get_automatic_captions(*args, **kwargs)
2821         return {}
2822
2823     def _get_automatic_captions(self, *args, **kwargs):
2824         raise NotImplementedError('This method must be implemented by subclasses')
2825
2826     def mark_watched(self, *args, **kwargs):
2827         if (self._downloader.params.get('mark_watched', False) and
2828                 (self._get_login_info()[0] is not None or
2829                     self._downloader.params.get('cookiefile') is not None)):
2830             self._mark_watched(*args, **kwargs)
2831
2832     def _mark_watched(self, *args, **kwargs):
2833         raise NotImplementedError('This method must be implemented by subclasses')
2834
2835     def geo_verification_headers(self):
2836         headers = {}
2837         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2838         if geo_verification_proxy:
2839             headers['Ytdl-request-proxy'] = geo_verification_proxy
2840         return headers
2841
2842     def _generic_id(self, url):
2843         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2844
2845     def _generic_title(self, url):
2846         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2847
2848
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors of paged search queries.
    The accepted "URLs" look like _SEARCH_KEY<prefix>:<query> where <prefix>
    is empty (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query handled by this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        Raises ExtractorError for a query not matching the expected format;
        a number above _MAX_RESULTS is reported and reduced to _MAX_RESULTS.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The value of _SEARCH_KEY"""
        return self._SEARCH_KEY