[extractor/common] Introduce channel meta fields
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_integer_types,
23     compat_http_client,
24     compat_os_name,
25     compat_str,
26     compat_urllib_error,
27     compat_urllib_parse_unquote,
28     compat_urllib_parse_urlencode,
29     compat_urllib_request,
30     compat_urlparse,
31     compat_xml_parse_error,
32 )
33 from ..downloader.f4m import (
34     get_base_url,
35     remove_encrypted_media,
36 )
37 from ..utils import (
38     NO_DEFAULT,
39     age_restricted,
40     base_url,
41     bug_reports_message,
42     clean_html,
43     compiled_regex_type,
44     determine_ext,
45     determine_protocol,
46     error_to_compat_str,
47     ExtractorError,
48     extract_attributes,
49     fix_xml_ampersands,
50     float_or_none,
51     GeoRestrictedError,
52     GeoUtils,
53     int_or_none,
54     js_to_json,
55     JSON_LD_RE,
56     mimetype2ext,
57     orderedSet,
58     parse_codecs,
59     parse_duration,
60     parse_iso8601,
61     parse_m3u8_attributes,
62     RegexNotFoundError,
63     sanitized_Request,
64     sanitize_filename,
65     unescapeHTML,
66     unified_strdate,
67     unified_timestamp,
68     update_Request,
69     update_url_query,
70     urljoin,
71     url_basename,
72     xpath_element,
73     xpath_text,
74     xpath_with_ns,
75 )
76
77
78 class InfoExtractor(object):
79     """Information Extractor class.
80
81     Information extractors are the classes that, given a URL, extract
82     information about the video (or videos) the URL refers to. This
83     information includes the real video URL, the video title, author and
84     others. The information is stored in a dictionary which is then
85     passed to the YoutubeDL. The YoutubeDL processes this
86     information possibly downloading the video to the file system, among
87     other possible outcomes.
88
89     The type field determines the type of the result.
90     By far the most common value (and the default if _type is missing) is
91     "video", which indicates a single video.
92
93     For a video, the dictionaries must include the following fields:
94
95     id:             Video identifier.
96     title:          Video title, unescaped.
97
98     Additionally, it must contain either a formats entry or a url one:
99
100     formats:        A list of dictionaries for each format available, ordered
101                     from worst to best quality.
102
103                     Potential fields:
104                     * url        Mandatory. The URL of the video file
105                     * manifest_url
106                                  The URL of the manifest file in case of
107                                  fragmented media (DASH, hls, hds)
108                     * ext        Will be calculated from URL if missing
109                     * format     A human-readable description of the format
110                                  ("mp4 container with h264/opus").
111                                  Calculated from the format_id, width, height.
112                                  and format_note fields if missing.
113                     * format_id  A short description of the format
114                                  ("mp4_h264_opus" or "19").
115                                 Technically optional, but strongly recommended.
116                     * format_note Additional info about the format
117                                  ("3D" or "DASH video")
118                     * width      Width of the video, if known
119                     * height     Height of the video, if known
120                     * resolution Textual description of width and height
121                     * tbr        Average bitrate of audio and video in KBit/s
122                     * abr        Average audio bitrate in KBit/s
123                     * acodec     Name of the audio codec in use
124                     * asr        Audio sampling rate in Hertz
125                     * vbr        Average video bitrate in KBit/s
126                     * fps        Frame rate
127                     * vcodec     Name of the video codec in use
128                     * container  Name of the container format
129                     * filesize   The number of bytes, if known in advance
130                     * filesize_approx  An estimate for the number of bytes
131                     * player_url SWF Player URL (used for rtmpdump).
132                     * protocol   The protocol that will be used for the actual
133                                  download, lower-case.
134                                  "http", "https", "rtsp", "rtmp", "rtmpe",
135                                  "m3u8", "m3u8_native" or "http_dash_segments".
136                     * fragment_base_url
137                                  Base URL for fragments. Each fragment's path
138                                  value (if present) will be relative to
139                                  this URL.
140                     * fragments  A list of fragments of a fragmented media.
141                                  Each fragment entry must contain either an url
142                                  or a path. If an url is present it should be
143                                  considered by a client. Otherwise both path and
144                                  fragment_base_url must be present. Here is
145                                  the list of all potential fields:
146                                  * "url" - fragment's URL
147                                  * "path" - fragment's path relative to
148                                             fragment_base_url
149                                  * "duration" (optional, int or float)
150                                  * "filesize" (optional, int)
151                     * preference Order number of this format. If this field is
152                                  present and not None, the formats get sorted
153                                  by this field, regardless of all other values.
154                                  -1 for default (order by other properties),
155                                  -2 or smaller for less than default.
156                                  < -1000 to hide the format (if there is
157                                     another one which is strictly better)
158                     * language   Language code, e.g. "de" or "en-US".
159                     * language_preference  Is this in the language mentioned in
160                                  the URL?
161                                  10 if it's what the URL is about,
162                                  -1 for default (don't know),
163                                  -10 otherwise, other values reserved for now.
164                     * quality    Order number of the video quality of this
165                                  format, irrespective of the file format.
166                                  -1 for default (order by other properties),
167                                  -2 or smaller for less than default.
168                     * source_preference  Order number for this video source
169                                   (quality takes higher priority)
170                                  -1 for default (order by other properties),
171                                  -2 or smaller for less than default.
172                     * http_headers  A dictionary of additional HTTP headers
173                                  to add to the request.
174                     * stretched_ratio  If given and not 1, indicates that the
175                                  video's pixels are not square.
176                                  width : height ratio as float.
177                     * no_resume  The server does not support resuming the
178                                  (HTTP or RTMP) download. Boolean.
179                     * downloader_options  A dictionary of downloader options as
180                                  described in FileDownloader
181
182     url:            Final video URL.
183     ext:            Video filename extension.
184     format:         The video format, defaults to ext (used for --get-format)
185     player_url:     SWF Player URL (used for rtmpdump).
186
187     The following fields are optional:
188
189     alt_title:      A secondary title of the video.
190     display_id      An alternative identifier for the video, not necessarily
191                     unique, but available before title. Typically, id is
192                     something like "4234987", title "Dancing naked mole rats",
193                     and display_id "dancing-naked-mole-rats"
194     thumbnails:     A list of dictionaries, with the following entries:
195                         * "id" (optional, string) - Thumbnail format ID
196                         * "url"
197                         * "preference" (optional, int) - quality of the image
198                         * "width" (optional, int)
199                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
201                                         deprecated)
202                         * "filesize" (optional, int)
203     thumbnail:      Full URL to a video thumbnail image.
204     description:    Full video description.
205     uploader:       Full name of the video uploader.
206     license:        License name the video is licensed under.
207     creator:        The creator of the video.
208     release_date:   The date (YYYYMMDD) when the video was released.
209     timestamp:      UNIX timestamp of the moment the video became available.
210     upload_date:    Video upload date (YYYYMMDD).
211                     If not explicitly set, calculated from timestamp.
212     uploader_id:    Nickname or id of the video uploader.
213     uploader_url:   Full URL to a personal webpage of the video uploader.
214     channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
216                     fields. This depends on a particular extractor.
217     channel_id:     Id of the channel.
218     channel_url:    Full URL to a channel webpage.
219     location:       Physical location where the video was filmed.
220     subtitles:      The available subtitles as a dictionary in the format
221                     {tag: subformats}. "tag" is usually a language code, and
222                     "subformats" is a list sorted from lower to higher
223                     preference, each element is a dictionary with the "ext"
224                     entry and one of:
225                         * "data": The subtitles file contents
226                         * "url": A URL pointing to the subtitles file
227                     "ext" will be calculated from URL if missing
228     automatic_captions: Like 'subtitles', used by the YoutubeIE for
229                     automatically generated captions
230     duration:       Length of the video in seconds, as an integer or float.
231     view_count:     How many users have watched the video on the platform.
232     like_count:     Number of positive ratings of the video
233     dislike_count:  Number of negative ratings of the video
234     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users; the scale used depends on the webpage.
236     comment_count:  Number of comments on the video
237     comments:       A list of comments, each with one or more of the following
238                     properties (all but one of text or html optional):
239                         * "author" - human-readable name of the comment author
240                         * "author_id" - user ID of the comment author
241                         * "id" - Comment ID
242                         * "html" - Comment as HTML
243                         * "text" - Plain text of the comment
244                         * "timestamp" - UNIX timestamp of comment
245                         * "parent" - ID of the comment this one is replying to.
246                                      Set to "root" to indicate that this is a
247                                      comment to the original video.
248     age_limit:      Age restriction for the video, as an integer (years)
249     webpage_url:    The URL to the video webpage, if given to youtube-dl it
250                     should allow to get the same result again. (It will be set
251                     by YoutubeDL if it's missing)
252     categories:     A list of categories that the video falls in, for example
253                     ["Sports", "Berlin"]
254     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
255     is_live:        True, False, or None (=unknown). Whether this video is a
256                     live stream that goes on instead of a fixed-length video.
257     start_time:     Time in seconds where the reproduction should start, as
258                     specified in the URL.
259     end_time:       Time in seconds where the reproduction should end, as
260                     specified in the URL.
261     chapters:       A list of dictionaries, with the following entries:
262                         * "start_time" - The start time of the chapter in seconds
263                         * "end_time" - The end time of the chapter in seconds
264                         * "title" (optional, string)
265
266     The following fields should only be used when the video belongs to some logical
267     chapter or section:
268
269     chapter:        Name or title of the chapter the video belongs to.
270     chapter_number: Number of the chapter the video belongs to, as an integer.
271     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
272
273     The following fields should only be used when the video is an episode of some
274     series, programme or podcast:
275
276     series:         Title of the series or programme the video episode belongs to.
277     season:         Title of the season the video episode belongs to.
278     season_number:  Number of the season the video episode belongs to, as an integer.
279     season_id:      Id of the season the video episode belongs to, as a unicode string.
280     episode:        Title of the video episode. Unlike mandatory video title field,
281                     this field should denote the exact title of the video episode
282                     without any kind of decoration.
283     episode_number: Number of the video episode within a season, as an integer.
284     episode_id:     Id of the video episode, as a unicode string.
285
286     The following fields should only be used when the media is a track or a part of
287     a music album:
288
289     track:          Title of the track.
290     track_number:   Number of the track within an album or a disc, as an integer.
291     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
292                     as a unicode string.
293     artist:         Artist(s) of the track.
294     genre:          Genre(s) of the track.
295     album:          Title of the album the track belongs to.
296     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
297     album_artist:   List of all artists appeared on the album (e.g.
298                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
299                     and compilations).
300     disc_number:    Number of the disc or other physical medium the track belongs to,
301                     as an integer.
302     release_year:   Year (YYYY) when the album was released.
303
304     Unless mentioned otherwise, the fields should be Unicode strings.
305
306     Unless mentioned otherwise, None is equivalent to absence of information.
307
308
309     _type "playlist" indicates multiple videos.
310     There must be a key "entries", which is a list, an iterable, or a PagedList
311     object, each element of which is a valid dictionary by this specification.
312
313     Additionally, playlists can have "id", "title", "description", "uploader",
314     "uploader_id", "uploader_url" attributes with the same semantics as videos
315     (see above).
316
317
318     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
320     It must have an entries key like a playlist and contain all the keys
321     required for a video at the same time.
322
323
324     _type "url" indicates that the video must be extracted from another
325     location, possibly by a different extractor. Its only required key is:
326     "url" - the next URL to extract.
327     The key "ie_key" can be set to the class name (minus the trailing "IE",
328     e.g. "Youtube") if the extractor class is known in advance.
329     Additionally, the dictionary may have any properties of the resolved entity
330     known in advance, for example "title" if the title of the referred video is
331     known ahead of time.
332
333
334     _type "url_transparent" entities have the same specification as "url", but
335     indicate that the given additional information is more precise than the one
336     associated with the resolved URL.
337     This is useful when a site employs a video service that hosts the video and
338     its technical metadata, but that video service does not embed a useful
339     title, description etc.
340
341
342     Subclasses of this one should re-define the _real_initialize() and
343     _real_extract() methods and define a _VALID_URL regexp.
344     Probably, they should also be added to the list of extractors.
345
346     _GEO_BYPASS attribute may be set to False in order to disable
347     geo restriction bypass mechanisms for a particular extractor.
348     Though it won't disable explicit geo restriction bypass based on
349     country code provided with geo_bypass_country.
350
351     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
352     countries for this extractor. One of these countries will be used by
353     geo restriction bypass mechanism right away in order to bypass
354     geo restriction, of course, if the mechanism is not disabled.
355
356     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
357     IP blocks in CIDR notation for this extractor. One of these IP blocks
358     will be used by geo restriction bypass mechanism similarly
359     to _GEO_COUNTRIES.
360
361     Finally, the _WORKING attribute should be set to False for broken IEs
362     in order to warn the users and skip the tests.
363     """
364
    # Per-class/per-instance defaults; the _GEO_* and _WORKING semantics are
    # described in detail in the class docstring above.
    _ready = False  # becomes True once _real_initialize() has run
    _downloader = None  # downloader instance, attached via set_downloader()
    _x_forwarded_for_ip = None  # fake source IP used for geo bypass, if any
    _GEO_BYPASS = True  # set to False in subclasses to disable geo bypass
    _GEO_COUNTRIES = None  # presumably geo-unrestricted country codes
    _GEO_IP_BLOCKS = None  # presumably geo-unrestricted IP blocks (CIDR)
    _WORKING = True  # set to False for broken IEs (warns users, skips tests)
372
373     def __init__(self, downloader=None):
374         """Constructor. Receives an optional downloader."""
375         self._ready = False
376         self._x_forwarded_for_ip = None
377         self.set_downloader(downloader)
378
379     @classmethod
380     def suitable(cls, url):
381         """Receives a URL and returns True if suitable for this IE."""
382
383         # This does not use has/getattr intentionally - we want to know whether
384         # we have cached the regexp for *this* class, whereas getattr would also
385         # match the superclass
386         if '_VALID_URL_RE' not in cls.__dict__:
387             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
388         return cls._VALID_URL_RE.match(url) is not None
389
390     @classmethod
391     def _match_id(cls, url):
392         if '_VALID_URL_RE' not in cls.__dict__:
393             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
394         m = cls._VALID_URL_RE.match(url)
395         assert m
396         return compat_str(m.group('id'))
397
398     @classmethod
399     def working(cls):
400         """Getter method for _WORKING."""
401         return cls._WORKING
402
403     def initialize(self):
404         """Initializes an instance (authentication, etc)."""
405         self._initialize_geo_bypass({
406             'countries': self._GEO_COUNTRIES,
407             'ip_blocks': self._GEO_IP_BLOCKS,
408         })
409         if not self._ready:
410             self._real_initialize()
411             self._ready = True
412
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        # The fake IP is generated at most once per instance; later calls
        # are no-ops once _x_forwarded_for_ip is set.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                # NOTE(review): random_ipv4 is passed a country code here
                # (vs. a CIDR block in Path 1) — presumably GeoUtils accepts
                # both forms; confirm against utils.py.
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))
500
501     def extract(self, url):
502         """Extracts URL information and returns it in list of dicts."""
503         try:
504             for _ in range(2):
505                 try:
506                     self.initialize()
507                     ie_result = self._real_extract(url)
508                     if self._x_forwarded_for_ip:
509                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
510                     return ie_result
511                 except GeoRestrictedError as e:
512                     if self.__maybe_fake_ip_and_retry(e.countries):
513                         continue
514                     raise
515         except ExtractorError:
516             raise
517         except compat_http_client.IncompleteRead as e:
518             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
519         except (KeyError, StopIteration) as e:
520             raise ExtractorError('An extractor error has occurred.', cause=e)
521
522     def __maybe_fake_ip_and_retry(self, countries):
523         if (not self._downloader.params.get('geo_bypass_country', None) and
524                 self._GEO_BYPASS and
525                 self._downloader.params.get('geo_bypass', True) and
526                 not self._x_forwarded_for_ip and
527                 countries):
528             country_code = random.choice(countries)
529             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
530             if self._x_forwarded_for_ip:
531                 self.report_warning(
532                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
533                     % (self._x_forwarded_for_ip, country_code.upper()))
534                 return True
535         return False
536
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader is used throughout the extractor for network access
        # (urlopen), option lookup (params) and user messaging (to_screen,
        # report_warning).
        self._downloader = downloader
540
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class; initialize() calls this
        # at most once per instance (guarded by _ready).
        pass
544
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class; extract() calls this and
        # expects an info dict (see the class docstring for its format).
        pass
548
549     @classmethod
550     def ie_key(cls):
551         """A string for getting the InfoExtractor with get_info_extractor"""
552         return compat_str(cls.__name__[:-2])
553
554     @property
555     def IE_NAME(self):
556         return compat_str(type(self).__name__[:-2])
557
558     @staticmethod
559     def __can_accept_status_code(err, expected_status):
560         assert isinstance(err, compat_urllib_error.HTTPError)
561         if expected_status is None:
562             return False
563         if isinstance(expected_status, compat_integer_types):
564             return err.code == expected_status
565         elif isinstance(expected_status, (list, tuple)):
566             return err.code in expected_status
567         elif callable(expected_status):
568             return expected_status(err.code) is True
569         else:
570             assert False
571
572     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
573         """
574         Return the response handle.
575
576         See _download_webpage docstring for arguments specification.
577         """
578         if note is None:
579             self.report_download_webpage(video_id)
580         elif note is not False:
581             if video_id is None:
582                 self.to_screen('%s' % (note,))
583             else:
584                 self.to_screen('%s: %s' % (video_id, note))
585
586         # Some sites check X-Forwarded-For HTTP header in order to figure out
587         # the origin of the client behind proxy. This allows bypassing geo
588         # restriction by faking this header's value to IP that belongs to some
589         # geo unrestricted country. We will do so once we encounter any
590         # geo restriction error.
591         if self._x_forwarded_for_ip:
592             if 'X-Forwarded-For' not in headers:
593                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
594
595         if isinstance(url_or_request, compat_urllib_request.Request):
596             url_or_request = update_Request(
597                 url_or_request, data=data, headers=headers, query=query)
598         else:
599             if query:
600                 url_or_request = update_url_query(url_or_request, query)
601             if data is not None or headers:
602                 url_or_request = sanitized_Request(url_or_request, data, headers)
603         try:
604             return self._downloader.urlopen(url_or_request)
605         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
606             if isinstance(err, compat_urllib_error.HTTPError):
607                 if self.__can_accept_status_code(err, expected_status):
608                     return err.fp
609
610             if errnote is False:
611                 return False
612             if errnote is None:
613                 errnote = 'Unable to download webpage'
614
615             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
616             if fatal:
617                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
618             else:
619                 self._downloader.report_warning(errmsg)
620                 return False
621
622     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
623         """
624         Return a tuple (page content as string, URL handle).
625
626         See _download_webpage docstring for arguments specification.
627         """
628         # Strip hashes from the URL (#1038)
629         if isinstance(url_or_request, (compat_str, str)):
630             url_or_request = url_or_request.partition('#')[0]
631
632         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
633         if urlh is False:
634             assert not fatal
635             return False
636         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
637         return (content, urlh)
638
639     @staticmethod
640     def _guess_encoding_from_content(content_type, webpage_bytes):
641         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
642         if m:
643             encoding = m.group(1)
644         else:
645             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
646                           webpage_bytes[:1024])
647             if m:
648                 encoding = m.group(1).decode('ascii')
649             elif webpage_bytes.startswith(b'\xff\xfe'):
650                 encoding = 'utf-16'
651             else:
652                 encoding = 'utf-8'
653
654         return encoding
655
    def __check_blocked(self, content):
        # Detect well-known censorship/filtering interstitial pages and raise
        # an explanatory (expected=True) error instead of handing callers a
        # useless block page.
        first_block = content[:512]
        # Websense corporate filtering software
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government censorship block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian government (Roskomnadzor blocklist) block page
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
683
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        # Read the response body of urlh and decode it to text, honoring the
        # optional encoding override and the dump/save-pages debug options.
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps the dump terminal-safe regardless of content
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the filename within common filesystem limits by
                # replacing the tail with a hash of the full name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
720
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Requestobject
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        # Retry loop: only truncated reads (IncompleteRead) are retried,
        # up to `tries` attempts with `timeout` seconds between them; any
        # other exception propagates immediately.
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        # res is False only when fatal is disabled and the download failed
        if res is False:
            return res
        else:
            content, _ = res
            return content
778
779     def _download_xml_handle(
780             self, url_or_request, video_id, note='Downloading XML',
781             errnote='Unable to download XML', transform_source=None,
782             fatal=True, encoding=None, data=None, headers={}, query={},
783             expected_status=None):
784         """
785         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
786
787         See _download_webpage docstring for arguments specification.
788         """
789         res = self._download_webpage_handle(
790             url_or_request, video_id, note, errnote, fatal=fatal,
791             encoding=encoding, data=data, headers=headers, query=query,
792             expected_status=expected_status)
793         if res is False:
794             return res
795         xml_string, urlh = res
796         return self._parse_xml(
797             xml_string, video_id, transform_source=transform_source,
798             fatal=fatal), urlh
799
800     def _download_xml(
801             self, url_or_request, video_id,
802             note='Downloading XML', errnote='Unable to download XML',
803             transform_source=None, fatal=True, encoding=None,
804             data=None, headers={}, query={}, expected_status=None):
805         """
806         Return the xml as an xml.etree.ElementTree.Element.
807
808         See _download_webpage docstring for arguments specification.
809         """
810         res = self._download_xml_handle(
811             url_or_request, video_id, note=note, errnote=errnote,
812             transform_source=transform_source, fatal=fatal, encoding=encoding,
813             data=data, headers=headers, query=query,
814             expected_status=expected_status)
815         return res if res is False else res[0]
816
817     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
818         if transform_source:
819             xml_string = transform_source(xml_string)
820         try:
821             return compat_etree_fromstring(xml_string.encode('utf-8'))
822         except compat_xml_parse_error as ve:
823             errmsg = '%s: Failed to parse XML ' % video_id
824             if fatal:
825                 raise ExtractorError(errmsg, cause=ve)
826             else:
827                 self.report_warning(errmsg + str(ve))
828
829     def _download_json_handle(
830             self, url_or_request, video_id, note='Downloading JSON metadata',
831             errnote='Unable to download JSON metadata', transform_source=None,
832             fatal=True, encoding=None, data=None, headers={}, query={},
833             expected_status=None):
834         """
835         Return a tuple (JSON object, URL handle).
836
837         See _download_webpage docstring for arguments specification.
838         """
839         res = self._download_webpage_handle(
840             url_or_request, video_id, note, errnote, fatal=fatal,
841             encoding=encoding, data=data, headers=headers, query=query,
842             expected_status=expected_status)
843         if res is False:
844             return res
845         json_string, urlh = res
846         return self._parse_json(
847             json_string, video_id, transform_source=transform_source,
848             fatal=fatal), urlh
849
850     def _download_json(
851             self, url_or_request, video_id, note='Downloading JSON metadata',
852             errnote='Unable to download JSON metadata', transform_source=None,
853             fatal=True, encoding=None, data=None, headers={}, query={},
854             expected_status=None):
855         """
856         Return the JSON object as a dict.
857
858         See _download_webpage docstring for arguments specification.
859         """
860         res = self._download_json_handle(
861             url_or_request, video_id, note=note, errnote=errnote,
862             transform_source=transform_source, fatal=fatal, encoding=encoding,
863             data=data, headers=headers, query=query,
864             expected_status=expected_status)
865         return res if res is False else res[0]
866
867     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
868         if transform_source:
869             json_string = transform_source(json_string)
870         try:
871             return json.loads(json_string)
872         except ValueError as ve:
873             errmsg = '%s: Failed to parse JSON ' % video_id
874             if fatal:
875                 raise ExtractorError(errmsg, cause=ve)
876             else:
877                 self.report_warning(errmsg + str(ve))
878
879     def report_warning(self, msg, video_id=None):
880         idstr = '' if video_id is None else '%s: ' % video_id
881         self._downloader.report_warning(
882             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
883
884     def to_screen(self, msg):
885         """Print msg to screen, prefixing it with '[ie_name]'"""
886         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
887
888     def report_extraction(self, id_or_name):
889         """Report information extraction."""
890         self.to_screen('%s: Extracting information' % id_or_name)
891
892     def report_download_webpage(self, video_id):
893         """Report webpage download."""
894         self.to_screen('%s: Downloading webpage' % video_id)
895
    def report_age_confirmation(self):
        """Report that an age-confirmation step is being performed."""
        self.to_screen('Confirming age')
899
    def report_login(self):
        """Report that a log-in attempt is being performed."""
        self.to_screen('Logging in')
903
904     @staticmethod
905     def raise_login_required(msg='This video is only available for registered users'):
906         raise ExtractorError(
907             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
908             expected=True)
909
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """Abort extraction with a geo-restriction error (countries: list of country codes where available, if known)."""
        raise GeoRestrictedError(msg, countries=countries)
913
914     # Methods for following #608
915     @staticmethod
916     def url_result(url, ie=None, video_id=None, video_title=None):
917         """Returns a URL that points to a page that should be processed"""
918         # TODO: ie should be the class used for getting the info
919         video_info = {'_type': 'url',
920                       'url': url,
921                       'ie_key': ie}
922         if video_id is not None:
923             video_info['id'] = video_id
924         if video_title is not None:
925             video_info['title'] = video_title
926         return video_info
927
928     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
929         urls = orderedSet(
930             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
931             for m in matches)
932         return self.playlist_result(
933             urls, playlist_id=playlist_id, playlist_title=playlist_title)
934
935     @staticmethod
936     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
937         """Returns a playlist"""
938         video_info = {'_type': 'playlist',
939                       'entries': entries}
940         if playlist_id:
941             video_info['id'] = playlist_id
942         if playlist_title:
943             video_info['title'] = playlist_title
944         if playlist_description:
945             video_info['description'] = playlist_description
946         return video_info
947
948     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
949         """
950         Perform a regex search on the given string, using a single or a list of
951         patterns returning the first matching group.
952         In case of failure return a default value or raise a WARNING or a
953         RegexNotFoundError, depending on fatal, specifying the field name.
954         """
955         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
956             mobj = re.search(pattern, string, flags)
957         else:
958             for p in pattern:
959                 mobj = re.search(p, string, flags)
960                 if mobj:
961                     break
962
963         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
964             _name = '\033[0;34m%s\033[0m' % name
965         else:
966             _name = name
967
968         if mobj:
969             if group is None:
970                 # return the first matching group
971                 return next(g for g in mobj.groups() if g is not None)
972             else:
973                 return mobj.group(group)
974         elif default is not NO_DEFAULT:
975             return default
976         elif fatal:
977             raise RegexNotFoundError('Unable to extract %s' % _name)
978         else:
979             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
980             return None
981
982     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
983         """
984         Like _search_regex, but strips HTML tags and unescapes entities.
985         """
986         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
987         if res:
988             return clean_html(res).strip()
989         else:
990             return res
991
992     def _get_netrc_login_info(self, netrc_machine=None):
993         username = None
994         password = None
995         netrc_machine = netrc_machine or self._NETRC_MACHINE
996
997         if self._downloader.params.get('usenetrc', False):
998             try:
999                 info = netrc.netrc().authenticators(netrc_machine)
1000                 if info is not None:
1001                     username = info[0]
1002                     password = info[2]
1003                 else:
1004                     raise netrc.NetrcParseError(
1005                         'No authenticators for %s' % netrc_machine)
1006             except (IOError, netrc.NetrcParseError) as err:
1007                 self._downloader.report_warning(
1008                     'parsing .netrc: %s' % error_to_compat_str(err))
1009
1010         return username, password
1011
1012     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1013         """
1014         Get the login info as (username, password)
1015         First look for the manually specified credentials using username_option
1016         and password_option as keys in params dictionary. If no such credentials
1017         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1018         value.
1019         If there's no info available, return (None, None)
1020         """
1021         if self._downloader is None:
1022             return (None, None)
1023
1024         downloader_params = self._downloader.params
1025
1026         # Attempt to use provided username and password or .netrc data
1027         if downloader_params.get(username_option) is not None:
1028             username = downloader_params[username_option]
1029             password = downloader_params[password_option]
1030         else:
1031             username, password = self._get_netrc_login_info(netrc_machine)
1032
1033         return username, password
1034
1035     def _get_tfa_info(self, note='two-factor verification code'):
1036         """
1037         Get the two-factor authentication info
1038         TODO - asking the user will be required for sms/phone verify
1039         currently just uses the command line option
1040         If there's no info available, return None
1041         """
1042         if self._downloader is None:
1043             return None
1044         downloader_params = self._downloader.params
1045
1046         if downloader_params.get('twofactor') is not None:
1047             return downloader_params['twofactor']
1048
1049         return compat_getpass('Type %s and press [Return]: ' % note)
1050
1051     # Helper functions for extracting OpenGraph info
1052     @staticmethod
1053     def _og_regexes(prop):
1054         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1055         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
1056                        % {'prop': re.escape(prop)})
1057         template = r'<meta[^>]+?%s[^>]+?%s'
1058         return [
1059             template % (property_re, content_re),
1060             template % (content_re, property_re),
1061         ]
1062
1063     @staticmethod
1064     def _meta_regex(prop):
1065         return r'''(?isx)<meta
1066                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1067                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1068
1069     def _og_search_property(self, prop, html, name=None, **kargs):
1070         if not isinstance(prop, (list, tuple)):
1071             prop = [prop]
1072         if name is None:
1073             name = 'OpenGraph %s' % prop[0]
1074         og_regexes = []
1075         for p in prop:
1076             og_regexes.extend(self._og_regexes(p))
1077         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1078         if escaped is None:
1079             return None
1080         return unescapeHTML(escaped)
1081
    def _og_search_thumbnail(self, html, **kargs):
        """Extract the og:image property as thumbnail URL (non-fatal)."""
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1084
    def _og_search_description(self, html, **kargs):
        """Extract the og:description property (non-fatal)."""
        return self._og_search_property('description', html, fatal=False, **kargs)
1087
    def _og_search_title(self, html, **kargs):
        """Extract the og:title property (fatal by default)."""
        return self._og_search_property('title', html, **kargs)
1090
1091     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1092         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1093         if secure:
1094             regexes = self._og_regexes('video:secure_url') + regexes
1095         return self._html_search_regex(regexes, html, name, **kargs)
1096
    def _og_search_url(self, html, **kargs):
        """Extract the og:url property (fatal by default)."""
        return self._og_search_property('url', html, **kargs)
1099
1100     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1101         if not isinstance(name, (list, tuple)):
1102             name = [name]
1103         if display_name is None:
1104             display_name = name[0]
1105         return self._html_search_regex(
1106             [self._meta_regex(n) for n in name],
1107             html, display_name, fatal=fatal, group='content', **kwargs)
1108
    def _dc_search_uploader(self, html):
        """Extract the uploader from the Dublin Core dc.creator meta tag."""
        return self._html_search_meta('dc.creator', html, 'uploader')
1111
1112     def _rta_search(self, html):
1113         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1114         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1115                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1116                      html):
1117             return 18
1118         return 0
1119
1120     def _media_rating_search(self, html):
1121         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1122         rating = self._html_search_meta('rating', html)
1123
1124         if not rating:
1125             return None
1126
1127         RATING_TABLE = {
1128             'safe for kids': 0,
1129             'general': 8,
1130             '14 years': 14,
1131             'mature': 17,
1132             'restricted': 19,
1133         }
1134         return RATING_TABLE.get(rating.lower())
1135
1136     def _family_friendly_search(self, html):
1137         # See http://schema.org/VideoObject
1138         family_friendly = self._html_search_meta(
1139             'isFamilyFriendly', html, default=None)
1140
1141         if not family_friendly:
1142             return None
1143
1144         RATING_TABLE = {
1145             '1': 0,
1146             'true': 0,
1147             '0': 18,
1148             'false': 18,
1149         }
1150         return RATING_TABLE.get(family_friendly.lower())
1151
    def _twitter_search_player(self, html):
        """Extract the twitter:player card URL (non-fatal)."""
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
1155
1156     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1157         json_ld = self._search_regex(
1158             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1159         default = kwargs.get('default', NO_DEFAULT)
1160         if not json_ld:
1161             return default if default is not NO_DEFAULT else {}
1162         # JSON-LD may be malformed and thus `fatal` should be respected.
1163         # At the same time `default` may be passed that assumes `fatal=False`
1164         # for _search_regex. Let's simulate the same behavior here as well.
1165         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1166         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1167
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Parse JSON-LD metadata (string or already-decoded object) into an info dict."""
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        # Normalize to a list of candidate items
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org InteractionCounter types to info dict count fields
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            # Fill <kind>_count fields from the interactionStatistic list;
            # the first counter of each kind wins
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # interactionType is a URL; only its last path component matters
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Map a schema.org VideoObject onto the info dict fields
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only consider items declaring the schema.org context
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Keep scanning: later items may add more fields
                    continue
                # Episode/Article items may embed a nested VideoObject
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                break
        # Drop fields that ended up with no value
        return dict((k, v) for k, v in info.items() if v is not None)
1261
1262     @staticmethod
1263     def _hidden_inputs(html):
1264         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1265         hidden_inputs = {}
1266         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1267             attrs = extract_attributes(input)
1268             if not input:
1269                 continue
1270             if attrs.get('type') not in ('hidden', 'submit'):
1271                 continue
1272             name = attrs.get('name') or attrs.get('id')
1273             value = attrs.get('value')
1274             if name and value is not None:
1275                 hidden_inputs[name] = value
1276         return hidden_inputs
1277
    def _form_hidden_inputs(self, form_id, html):
        """Return the hidden inputs of the <form> whose id attribute equals form_id (fatal if the form is missing)."""
        # NOTE(review): form_id is interpolated into the regex unescaped —
        # callers are expected to pass literal ids without regex metacharacters
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
1283
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        field_preference: optional list/tuple of format-dict keys; when
        given, formats are ordered by exactly those fields (a missing
        value sorts as -1, or '' for format_id) and the built-in
        heuristics below are skipped.
        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified field ordering overrides all heuristics below
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Plain HTTP(S) ranks above other protocols; RTSP is ranked worst
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Compared lexicographically: earlier tuple entries dominate later ones
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1359
1360     def _check_formats(self, formats, video_id):
1361         if formats:
1362             formats[:] = filter(
1363                 lambda f: self._is_valid_url(
1364                     f['url'], video_id,
1365                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1366                 formats)
1367
1368     @staticmethod
1369     def _remove_duplicate_formats(formats):
1370         format_urls = set()
1371         unique_formats = []
1372         for f in formats:
1373             if f['url'] not in format_urls:
1374                 format_urls.add(f['url'])
1375                 unique_formats.append(f)
1376         formats[:] = unique_formats
1377
1378     def _is_valid_url(self, url, video_id, item='video', headers={}):
1379         url = self._proto_relative_url(url, scheme='http:')
1380         # For now assume non HTTP(S) URLs always valid
1381         if not (url.startswith('http://') or url.startswith('https://')):
1382             return True
1383         try:
1384             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1385             return True
1386         except ExtractorError as e:
1387             if isinstance(e.cause, compat_urllib_error.URLError):
1388                 self.to_screen(
1389                     '%s: %s URL is invalid, skipping' % (video_id, item))
1390                 return False
1391             raise
1392
1393     def http_scheme(self):
1394         """ Either "http:" or "https:", depending on the user's preferences """
1395         return (
1396             'http:'
1397             if self._downloader.params.get('prefer_insecure', False)
1398             else 'https:')
1399
1400     def _proto_relative_url(self, url, scheme=None):
1401         if url is None:
1402             return url
1403         if url.startswith('//'):
1404             if scheme is None:
1405                 scheme = self.http_scheme()
1406             return scheme + url
1407         else:
1408             return url
1409
1410     def _sleep(self, timeout, video_id, msg_template=None):
1411         if msg_template is None:
1412             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1413         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1414         self.to_screen(msg)
1415         time.sleep(timeout)
1416
1417     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1418                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1419                              fatal=True, m3u8_id=None):
1420         manifest = self._download_xml(
1421             manifest_url, video_id, 'Downloading f4m manifest',
1422             'Unable to download f4m manifest',
1423             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1424             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1425             transform_source=transform_source,
1426             fatal=fatal)
1427
1428         if manifest is False:
1429             return []
1430
1431         return self._parse_f4m_formats(
1432             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1433             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1434
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse formats from an already-downloaded f4m (HDS) manifest.

        manifest: XML element tree of the f4m document; manifest_url is
        used to resolve relative media URLs. Returns a list of format
        dicts; an empty list for Akamai player-verification protected
        manifests.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media entries exist
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            # NOTE(review): the descriptive name 'base URL' below looks
            # copy-pasted; this element is the mime type (name only affects
            # error messages) — consider renaming
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    # Referenced resource is an HLS playlist instead
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1532
1533     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1534         return {
1535             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1536             'url': m3u8_url,
1537             'ext': ext,
1538             'protocol': 'm3u8',
1539             'preference': preference - 100 if preference else -100,
1540             'resolution': 'multiple',
1541             'format_note': 'Quality selection URL',
1542         }
1543
1544     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1545                               entry_protocol='m3u8', preference=None,
1546                               m3u8_id=None, note=None, errnote=None,
1547                               fatal=True, live=False):
1548         res = self._download_webpage_handle(
1549             m3u8_url, video_id,
1550             note=note or 'Downloading m3u8 information',
1551             errnote=errnote or 'Failed to download m3u8 information',
1552             fatal=fatal)
1553
1554         if res is False:
1555             return []
1556
1557         m3u8_doc, urlh = res
1558         m3u8_url = urlh.geturl()
1559
1560         return self._parse_m3u8_formats(
1561             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1562             preference=preference, m3u8_id=m3u8_id, live=live)
1563
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse formats from an HLS (m3u8) playlist document.

        m3u8_doc: the playlist text; m3u8_url: its URL, used to resolve
        relative entries. Returns a list of format dicts. DRM-protected
        playlists (Adobe Flash Access, Apple FairPlay) yield an empty
        list; a media playlist (as opposed to a master playlist) is
        returned as a single format entry.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve a playlist entry relative to the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recently seen EXT-X-STREAM-INF tag
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-comment line is the variant stream URL belonging to
                # the most recent EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group, it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1722
1723     @staticmethod
1724     def _xpath_ns(path, namespace=None):
1725         if not namespace:
1726             return path
1727         out = []
1728         for c in path.split('/'):
1729             if not c or c == '.':
1730                 out.append(c)
1731             else:
1732                 out.append('{%s}%s' % (namespace, c))
1733         return '/'.join(out)
1734
1735     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1736         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1737
1738         if smil is False:
1739             assert not fatal
1740             return []
1741
1742         namespace = self._parse_smil_namespace(smil)
1743
1744         return self._parse_smil_formats(
1745             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1746
1747     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1748         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1749         if smil is False:
1750             return {}
1751         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1752
1753     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1754         return self._download_xml(
1755             smil_url, video_id, 'Downloading SMIL file',
1756             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1757
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Parse a SMIL document into a single info dict.

        Extracts formats and subtitles, then head metadata (title,
        description/abstract, upload date) and <image> thumbnails.
        """
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # The SMIL file name (without extension) is reused as the video id
        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            # The first occurrence of each meta name wins
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
1797
1798     def _parse_smil_namespace(self, smil):
1799         return self._search_regex(
1800             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1801
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from the <video>/<audio> nodes of a SMIL document.

        transform_rtmp_url: optional callable (streamer, src) ->
        (streamer, src) applied to RTMP entries after they are built.
        """
        # <meta base=...> (or httpBase) overrides the manifest URL as the
        # base for relative media sources
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # Track seen sources so duplicates are emitted only once
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry HLS result carries no quality metadata of
                # its own; copy it over from this SMIL medium
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default HDS client parameters expected by Akamai-style servers
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
1896
1897     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1898         urls = []
1899         subtitles = {}
1900         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1901             src = textstream.get('src')
1902             if not src or src in urls:
1903                 continue
1904             urls.append(src)
1905             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1906             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1907             subtitles.setdefault(lang, []).append({
1908                 'url': src,
1909                 'ext': ext,
1910             })
1911         return subtitles
1912
1913     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1914         xspf = self._download_xml(
1915             xspf_url, playlist_id, 'Downloading xpsf playlist',
1916             'Unable to download xspf manifest', fatal=fatal)
1917         if xspf is False:
1918             return []
1919         return self._parse_xspf(
1920             xspf, playlist_id, xspf_url=xspf_url,
1921             xspf_base_url=base_url(xspf_url))
1922
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        """Parse entries from an XSPF playlist document.

        xspf_doc: XML element tree of the playlist. Each <track> becomes
        an entry dict with title, description, thumbnail, duration and
        sorted formats; StreamOne s1:* attributes on <location> supply
        per-format metadata.
        """
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = []
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                if not format_url:
                    continue
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
                })
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries
1963
1964     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1965         res = self._download_xml_handle(
1966             mpd_url, video_id,
1967             note=note or 'Downloading MPD manifest',
1968             errnote=errnote or 'Failed to download MPD manifest',
1969             fatal=fatal)
1970         if res is False:
1971             return []
1972         mpd_doc, urlh = res
1973         mpd_base_url = base_url(urlh.geturl())
1974
1975         return self._parse_mpd_formats(
1976             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1977             formats_dict=formats_dict, mpd_url=mpd_url)
1978
1979     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1980         """
1981         Parse formats from MPD manifest.
1982         References:
1983          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1984             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1985          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1986         """
1987         if mpd_doc.get('type') == 'dynamic':
1988             return []
1989
1990         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1991
1992         def _add_ns(path):
1993             return self._xpath_ns(path, namespace)
1994
1995         def is_drm_protected(element):
1996             return element.find(_add_ns('ContentProtection')) is not None
1997
1998         def extract_multisegment_info(element, ms_parent_info):
1999             ms_info = ms_parent_info.copy()
2000
2001             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2002             # common attributes and elements.  We will only extract relevant
2003             # for us.
2004             def extract_common(source):
2005                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2006                 if segment_timeline is not None:
2007                     s_e = segment_timeline.findall(_add_ns('S'))
2008                     if s_e:
2009                         ms_info['total_number'] = 0
2010                         ms_info['s'] = []
2011                         for s in s_e:
2012                             r = int(s.get('r', 0))
2013                             ms_info['total_number'] += 1 + r
2014                             ms_info['s'].append({
2015                                 't': int(s.get('t', 0)),
2016                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2017                                 'd': int(s.attrib['d']),
2018                                 'r': r,
2019                             })
2020                 start_number = source.get('startNumber')
2021                 if start_number:
2022                     ms_info['start_number'] = int(start_number)
2023                 timescale = source.get('timescale')
2024                 if timescale:
2025                     ms_info['timescale'] = int(timescale)
2026                 segment_duration = source.get('duration')
2027                 if segment_duration:
2028                     ms_info['segment_duration'] = float(segment_duration)
2029
2030             def extract_Initialization(source):
2031                 initialization = source.find(_add_ns('Initialization'))
2032                 if initialization is not None:
2033                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2034
2035             segment_list = element.find(_add_ns('SegmentList'))
2036             if segment_list is not None:
2037                 extract_common(segment_list)
2038                 extract_Initialization(segment_list)
2039                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2040                 if segment_urls_e:
2041                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2042             else:
2043                 segment_template = element.find(_add_ns('SegmentTemplate'))
2044                 if segment_template is not None:
2045                     extract_common(segment_template)
2046                     media = segment_template.get('media')
2047                     if media:
2048                         ms_info['media'] = media
2049                     initialization = segment_template.get('initialization')
2050                     if initialization:
2051                         ms_info['initialization'] = initialization
2052                     else:
2053                         extract_Initialization(segment_template)
2054             return ms_info
2055
2056         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2057         formats = []
2058         for period in mpd_doc.findall(_add_ns('Period')):
2059             period_duration = parse_duration(period.get('duration')) or mpd_duration
2060             period_ms_info = extract_multisegment_info(period, {
2061                 'start_number': 1,
2062                 'timescale': 1,
2063             })
2064             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2065                 if is_drm_protected(adaptation_set):
2066                     continue
2067                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2068                 for representation in adaptation_set.findall(_add_ns('Representation')):
2069                     if is_drm_protected(representation):
2070                         continue
2071                     representation_attrib = adaptation_set.attrib.copy()
2072                     representation_attrib.update(representation.attrib)
2073                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2074                     mime_type = representation_attrib['mimeType']
2075                     content_type = mime_type.split('/')[0]
2076                     if content_type == 'text':
2077                         # TODO implement WebVTT downloading
2078                         pass
2079                     elif content_type in ('video', 'audio'):
2080                         base_url = ''
2081                         for element in (representation, adaptation_set, period, mpd_doc):
2082                             base_url_e = element.find(_add_ns('BaseURL'))
2083                             if base_url_e is not None:
2084                                 base_url = base_url_e.text + base_url
2085                                 if re.match(r'^https?://', base_url):
2086                                     break
2087                         if mpd_base_url and not re.match(r'^https?://', base_url):
2088                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2089                                 mpd_base_url += '/'
2090                             base_url = mpd_base_url + base_url
2091                         representation_id = representation_attrib.get('id')
2092                         lang = representation_attrib.get('lang')
2093                         url_el = representation.find(_add_ns('BaseURL'))
2094                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2095                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2096                         f = {
2097                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2098                             'url': base_url,
2099                             'manifest_url': mpd_url,
2100                             'ext': mimetype2ext(mime_type),
2101                             'width': int_or_none(representation_attrib.get('width')),
2102                             'height': int_or_none(representation_attrib.get('height')),
2103                             'tbr': float_or_none(bandwidth, 1000),
2104                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2105                             'fps': int_or_none(representation_attrib.get('frameRate')),
2106                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2107                             'format_note': 'DASH %s' % content_type,
2108                             'filesize': filesize,
2109                             'container': mimetype2ext(mime_type) + '_dash',
2110                         }
2111                         f.update(parse_codecs(representation_attrib.get('codecs')))
2112                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2113
2114                         def prepare_template(template_name, identifiers):
2115                             tmpl = representation_ms_info[template_name]
2116                             # First of, % characters outside $...$ templates
2117                             # must be escaped by doubling for proper processing
2118                             # by % operator string formatting used further (see
2119                             # https://github.com/rg3/youtube-dl/issues/16867).
2120                             t = ''
2121                             in_template = False
2122                             for c in tmpl:
2123                                 t += c
2124                                 if c == '$':
2125                                     in_template = not in_template
2126                                 elif c == '%' and not in_template:
2127                                     t += c
2128                             # Next, $...$ templates are translated to their
2129                             # %(...) counterparts to be used with % operator
2130                             t = t.replace('$RepresentationID$', representation_id)
2131                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2132                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2133                             t.replace('$$', '$')
2134                             return t
2135
2136                         # @initialization is a regular template like @media one
2137                         # so it should be handled just the same way (see
2138                         # https://github.com/rg3/youtube-dl/issues/11605)
2139                         if 'initialization' in representation_ms_info:
2140                             initialization_template = prepare_template(
2141                                 'initialization',
2142                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2143                                 # $Time$ shall not be included for @initialization thus
2144                                 # only $Bandwidth$ remains
2145                                 ('Bandwidth', ))
2146                             representation_ms_info['initialization_url'] = initialization_template % {
2147                                 'Bandwidth': bandwidth,
2148                             }
2149
2150                         def location_key(location):
2151                             return 'url' if re.match(r'^https?://', location) else 'path'
2152
2153                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2154
2155                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2156                             media_location_key = location_key(media_template)
2157
2158                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2159                             # can't be used at the same time
2160                             if '%(Number' in media_template and 's' not in representation_ms_info:
2161                                 segment_duration = None
2162                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2163                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2164                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2165                                 representation_ms_info['fragments'] = [{
2166                                     media_location_key: media_template % {
2167                                         'Number': segment_number,
2168                                         'Bandwidth': bandwidth,
2169                                     },
2170                                     'duration': segment_duration,
2171                                 } for segment_number in range(
2172                                     representation_ms_info['start_number'],
2173                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2174                             else:
2175                                 # $Number*$ or $Time$ in media template with S list available
2176                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2177                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2178                                 representation_ms_info['fragments'] = []
2179                                 segment_time = 0
2180                                 segment_d = None
2181                                 segment_number = representation_ms_info['start_number']
2182
2183                                 def add_segment_url():
2184                                     segment_url = media_template % {
2185                                         'Time': segment_time,
2186                                         'Bandwidth': bandwidth,
2187                                         'Number': segment_number,
2188                                     }
2189                                     representation_ms_info['fragments'].append({
2190                                         media_location_key: segment_url,
2191                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2192                                     })
2193
2194                                 for num, s in enumerate(representation_ms_info['s']):
2195                                     segment_time = s.get('t') or segment_time
2196                                     segment_d = s['d']
2197                                     add_segment_url()
2198                                     segment_number += 1
2199                                     for r in range(s.get('r', 0)):
2200                                         segment_time += segment_d
2201                                         add_segment_url()
2202                                         segment_number += 1
2203                                     segment_time += segment_d
2204                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2205                             # No media template
2206                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2207                             # or any YouTube dashsegments video
2208                             fragments = []
2209                             segment_index = 0
2210                             timescale = representation_ms_info['timescale']
2211                             for s in representation_ms_info['s']:
2212                                 duration = float_or_none(s['d'], timescale)
2213                                 for r in range(s.get('r', 0) + 1):
2214                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2215                                     fragments.append({
2216                                         location_key(segment_uri): segment_uri,
2217                                         'duration': duration,
2218                                     })
2219                                     segment_index += 1
2220                             representation_ms_info['fragments'] = fragments
2221                         elif 'segment_urls' in representation_ms_info:
2222                             # Segment URLs with no SegmentTimeline
2223                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2224                             # https://github.com/rg3/youtube-dl/pull/14844
2225                             fragments = []
2226                             segment_duration = float_or_none(
2227                                 representation_ms_info['segment_duration'],
2228                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2229                             for segment_url in representation_ms_info['segment_urls']:
2230                                 fragment = {
2231                                     location_key(segment_url): segment_url,
2232                                 }
2233                                 if segment_duration:
2234                                     fragment['duration'] = segment_duration
2235                                 fragments.append(fragment)
2236                             representation_ms_info['fragments'] = fragments
2237                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2238                         # No fragments key is present in this case.
2239                         if 'fragments' in representation_ms_info:
2240                             f.update({
2241                                 'fragment_base_url': base_url,
2242                                 'fragments': [],
2243                                 'protocol': 'http_dash_segments',
2244                             })
2245                             if 'initialization_url' in representation_ms_info:
2246                                 initialization_url = representation_ms_info['initialization_url']
2247                                 if not f.get('url'):
2248                                     f['url'] = initialization_url
2249                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2250                             f['fragments'].extend(representation_ms_info['fragments'])
2251                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2252                         # is not necessarily unique within a Period thus formats with
2253                         # the same `format_id` are quite possible. There are numerous examples
2254                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2255                         # https://github.com/rg3/youtube-dl/issues/13919)
2256                         full_info = formats_dict.get(representation_id, {}).copy()
2257                         full_info.update(f)
2258                         formats.append(full_info)
2259                     else:
2260                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2261         return formats
2262
2263     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2264         res = self._download_xml_handle(
2265             ism_url, video_id,
2266             note=note or 'Downloading ISM manifest',
2267             errnote=errnote or 'Failed to download ISM manifest',
2268             fatal=fatal)
2269         if res is False:
2270             return []
2271         ism_doc, urlh = res
2272
2273         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2274
2275     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2276         """
2277         Parse formats from ISM manifest.
2278         References:
2279          1. [MS-SSTR]: Smooth Streaming Protocol,
2280             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2281         """
2282         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2283             return []
2284
2285         duration = int(ism_doc.attrib['Duration'])
2286         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2287
2288         formats = []
2289         for stream in ism_doc.findall('StreamIndex'):
2290             stream_type = stream.get('Type')
2291             if stream_type not in ('video', 'audio'):
2292                 continue
2293             url_pattern = stream.attrib['Url']
2294             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2295             stream_name = stream.get('Name')
2296             for track in stream.findall('QualityLevel'):
2297                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2298                 # TODO: add support for WVC1 and WMAP
2299                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2300                     self.report_warning('%s is not a supported codec' % fourcc)
2301                     continue
2302                 tbr = int(track.attrib['Bitrate']) // 1000
2303                 # [1] does not mention Width and Height attributes. However,
2304                 # they're often present while MaxWidth and MaxHeight are
2305                 # missing, so should be used as fallbacks
2306                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2307                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2308                 sampling_rate = int_or_none(track.get('SamplingRate'))
2309
2310                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2311                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2312
2313                 fragments = []
2314                 fragment_ctx = {
2315                     'time': 0,
2316                 }
2317                 stream_fragments = stream.findall('c')
2318                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2319                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2320                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2321                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2322                     if not fragment_ctx['duration']:
2323                         try:
2324                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2325                         except IndexError:
2326                             next_fragment_time = duration
2327                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2328                     for _ in range(fragment_repeat):
2329                         fragments.append({
2330                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2331                             'duration': fragment_ctx['duration'] / stream_timescale,
2332                         })
2333                         fragment_ctx['time'] += fragment_ctx['duration']
2334
2335                 format_id = []
2336                 if ism_id:
2337                     format_id.append(ism_id)
2338                 if stream_name:
2339                     format_id.append(stream_name)
2340                 format_id.append(compat_str(tbr))
2341
2342                 formats.append({
2343                     'format_id': '-'.join(format_id),
2344                     'url': ism_url,
2345                     'manifest_url': ism_url,
2346                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2347                     'width': width,
2348                     'height': height,
2349                     'tbr': tbr,
2350                     'asr': sampling_rate,
2351                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2352                     'acodec': 'none' if stream_type == 'video' else fourcc,
2353                     'protocol': 'ism',
2354                     'fragments': fragments,
2355                     '_download_params': {
2356                         'duration': duration,
2357                         'timescale': stream_timescale,
2358                         'width': width or 0,
2359                         'height': height or 0,
2360                         'fourcc': fourcc,
2361                         'codec_private_data': track.get('CodecPrivateData'),
2362                         'sampling_rate': sampling_rate,
2363                         'channels': int_or_none(track.get('Channels', 2)),
2364                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2365                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2366                     },
2367                 })
2368         return formats
2369
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in a webpage.

        Scans the page for (amp-)video/audio tags, resolves their src/
        <source> children into formats (expanding m3u8/mpd manifests) and
        collects <track> subtitles. Returns a list of info dicts, each with
        'formats', 'subtitles' and 'thumbnail' keys; tags yielding neither
        formats nor subtitles are dropped.
        """
        def absolute_url(item_url):
            # Resolve a possibly-relative URL against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Map a MIME type (optionally with codecs="...") to ext/codec
            # fields; returns {} when nothing can be parsed.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) are
            # expanded into multiple formats, plain URLs yield a single one.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                # Direct src attribute on the media tag itself
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # res attribute is not standard but seen several times
                        # in the wild
                        f.update({
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        })
                        # formats[0] wins over f on conflicting keys
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                # Some servers require a Referer to serve the media
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2465
2466     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2467         formats = []
2468         hdcore_sign = 'hdcore=3.7.0'
2469         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2470         hds_host = hosts.get('hds')
2471         if hds_host:
2472             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2473         if 'hdcore=' not in f4m_url:
2474             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2475         f4m_formats = self._extract_f4m_formats(
2476             f4m_url, video_id, f4m_id='hds', fatal=False)
2477         for entry in f4m_formats:
2478             entry.update({'extra_param_to_segment_url': hdcore_sign})
2479         formats.extend(f4m_formats)
2480         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2481         hls_host = hosts.get('hls')
2482         if hls_host:
2483             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2484         formats.extend(self._extract_m3u8_formats(
2485             m3u8_url, video_id, 'mp4', 'm3u8_native',
2486             m3u8_id='hls', fatal=False))
2487         return formats
2488
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Extract formats from a Wowza streaming engine URL.

        Derives per-protocol manifest/stream URLs (HLS, HDS, DASH, and
        SMIL or plain RTMP/RTSP) from a single Wowza URL and collects
        formats for every protocol not listed in skip_protocols.
        """
        query = compat_urlparse.urlparse(url).query
        # Strip any explicit manifest filename; protocol-specific names
        # are re-appended below by manifest_url().
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        # Capture the scheme-less base URL and remember whether the
        # original scheme was secure ('s' group).
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            # Append the given manifest filename, preserving the original
            # query string (tokens/signatures often live there).
            m_url = '%s/%s' % (http_base_url, manifest)
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            # SMIL-style URL: RTMP formats come from the jwplayer SMIL
            # manifest, and an RTSP twin is synthesized for each of them.
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    # RTSP uses a single URL instead of url + play_path.
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            # Plain stream URL: offer direct RTMP and RTSP variants.
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
2541
2542     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2543         mobj = re.search(
2544             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2545             webpage)
2546         if mobj:
2547             try:
2548                 jwplayer_data = self._parse_json(mobj.group('options'),
2549                                                  video_id=video_id,
2550                                                  transform_source=transform_source)
2551             except ExtractorError:
2552                 pass
2553             else:
2554                 if isinstance(jwplayer_data, dict):
2555                     return jwplayer_data
2556
2557     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2558         jwplayer_data = self._find_jwplayer_data(
2559             webpage, video_id, transform_source=js_to_json)
2560         return self._parse_jwplayer_data(
2561             jwplayer_data, video_id, *args, **kwargs)
2562
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer setup/playlist dict into an info dict.

        Returns a single entry when the playlist has exactly one item,
        otherwise a playlist result.  When require_title is True,
        video_data['title'] is accessed directly and raises KeyError if
        missing.  NOTE(review): mutates jwplayer_data in place when
        normalizing the legacy flattened layouts below.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks, keyed by their label
            # (defaulting to 'en' when no label is present).
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                # A single YouTube URL is delegated to the YouTube extractor.
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2630
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build a formats list from a JWPlayer 'sources' array.

        Dispatches each source on its MIME type/extension: HLS, DASH and
        SMIL manifests are expanded via the corresponding extractors;
        audio-only and progressive/RTMP sources become single formats.
        Duplicate source URLs are skipped.
        """
        urls = []  # source URLs seen so far, for de-duplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            if source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split 'rtmp://host/app/mp4:path' into the RTMP URL and
                    # the prefixed play path expected by rtmpdump.
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2697
2698     def _live_title(self, name):
2699         """ Generate the title for a live video """
2700         now = datetime.datetime.now()
2701         now_str = now.strftime('%Y-%m-%d %H:%M')
2702         return name + ' ' + now_str
2703
2704     def _int(self, v, name, fatal=False, **kwargs):
2705         res = int_or_none(v, **kwargs)
2706         if 'get_attr' in kwargs:
2707             print(getattr(v, kwargs['get_attr']))
2708         if res is None:
2709             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2710             if fatal:
2711                 raise ExtractorError(msg)
2712             else:
2713                 self._downloader.report_warning(msg)
2714         return res
2715
2716     def _float(self, v, name, fatal=False, **kwargs):
2717         res = float_or_none(v, **kwargs)
2718         if res is None:
2719             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2720             if fatal:
2721                 raise ExtractorError(msg)
2722             else:
2723                 self._downloader.report_warning(msg)
2724         return res
2725
2726     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2727                     path='/', secure=False, discard=False, rest={}, **kwargs):
2728         cookie = compat_cookiejar.Cookie(
2729             0, name, value, port, port is not None, domain, True,
2730             domain.startswith('.'), path, True, secure, expire_time,
2731             discard, None, None, rest)
2732         self._downloader.cookiejar.set_cookie(cookie)
2733
2734     def _get_cookies(self, url):
2735         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2736         req = sanitized_Request(url)
2737         self._downloader.cookiejar.add_cookie_header(req)
2738         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2739
2740     def get_testcases(self, include_onlymatching=False):
2741         t = getattr(self, '_TEST', None)
2742         if t:
2743             assert not hasattr(self, '_TESTS'), \
2744                 '%s has _TEST and _TESTS' % type(self).__name__
2745             tests = [t]
2746         else:
2747             tests = getattr(self, '_TESTS', [])
2748         for t in tests:
2749             if not include_onlymatching and t.get('only_matching', False):
2750                 continue
2751             t['name'] = type(self).__name__[:-len('IE')]
2752             yield t
2753
2754     def is_suitable(self, age_limit):
2755         """ Test whether the extractor is generally suitable for the given
2756         age limit (i.e. pornographic sites are not, all others usually are) """
2757
2758         any_restricted = False
2759         for tc in self.get_testcases(include_onlymatching=False):
2760             if tc.get('playlist', []):
2761                 tc = tc['playlist'][0]
2762             is_restricted = age_restricted(
2763                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2764             if not is_restricted:
2765                 return True
2766             any_restricted = any_restricted or is_restricted
2767         return not any_restricted
2768
2769     def extract_subtitles(self, *args, **kwargs):
2770         if (self._downloader.params.get('writesubtitles', False) or
2771                 self._downloader.params.get('listsubtitles')):
2772             return self._get_subtitles(*args, **kwargs)
2773         return {}
2774
2775     def _get_subtitles(self, *args, **kwargs):
2776         raise NotImplementedError('This method must be implemented by subclasses')
2777
2778     @staticmethod
2779     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2780         """ Merge subtitle items for one language. Items with duplicated URLs
2781         will be dropped. """
2782         list1_urls = set([item['url'] for item in subtitle_list1])
2783         ret = list(subtitle_list1)
2784         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2785         return ret
2786
2787     @classmethod
2788     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2789         """ Merge two subtitle dictionaries, language by language. """
2790         ret = dict(subtitle_dict1)
2791         for lang in subtitle_dict2:
2792             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2793         return ret
2794
2795     def extract_automatic_captions(self, *args, **kwargs):
2796         if (self._downloader.params.get('writeautomaticsub', False) or
2797                 self._downloader.params.get('listsubtitles')):
2798             return self._get_automatic_captions(*args, **kwargs)
2799         return {}
2800
2801     def _get_automatic_captions(self, *args, **kwargs):
2802         raise NotImplementedError('This method must be implemented by subclasses')
2803
2804     def mark_watched(self, *args, **kwargs):
2805         if (self._downloader.params.get('mark_watched', False) and
2806                 (self._get_login_info()[0] is not None or
2807                     self._downloader.params.get('cookiefile') is not None)):
2808             self._mark_watched(*args, **kwargs)
2809
2810     def _mark_watched(self, *args, **kwargs):
2811         raise NotImplementedError('This method must be implemented by subclasses')
2812
2813     def geo_verification_headers(self):
2814         headers = {}
2815         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2816         if geo_verification_proxy:
2817             headers['Ytdl-request-proxy'] = geo_verification_proxy
2818         return headers
2819
2820     def _generic_id(self, url):
2821         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2822
2823     def _generic_title(self, url):
2824         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2825
2826
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # '<key><prefix>:<query>' where prefix is empty (first result
        # only), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search 'URL' and delegate to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No count prefix: return only the first result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum, but warn the user.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        # Public accessor for the class-level _SEARCH_KEY.
        return self._SEARCH_KEY