[extractor/common] Properly escape % in MPD templates (closes #16867)
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_integer_types,
23     compat_http_client,
24     compat_os_name,
25     compat_str,
26     compat_urllib_error,
27     compat_urllib_parse_unquote,
28     compat_urllib_parse_urlencode,
29     compat_urllib_request,
30     compat_urlparse,
31     compat_xml_parse_error,
32 )
33 from ..downloader.f4m import (
34     get_base_url,
35     remove_encrypted_media,
36 )
37 from ..utils import (
38     NO_DEFAULT,
39     age_restricted,
40     base_url,
41     bug_reports_message,
42     clean_html,
43     compiled_regex_type,
44     determine_ext,
45     determine_protocol,
46     error_to_compat_str,
47     ExtractorError,
48     extract_attributes,
49     fix_xml_ampersands,
50     float_or_none,
51     GeoRestrictedError,
52     GeoUtils,
53     int_or_none,
54     js_to_json,
55     mimetype2ext,
56     orderedSet,
57     parse_codecs,
58     parse_duration,
59     parse_iso8601,
60     parse_m3u8_attributes,
61     RegexNotFoundError,
62     sanitized_Request,
63     sanitize_filename,
64     unescapeHTML,
65     unified_strdate,
66     unified_timestamp,
67     update_Request,
68     update_url_query,
69     urljoin,
70     url_basename,
71     xpath_element,
72     xpath_text,
73     xpath_with_ns,
74 )
75
76
77 class InfoExtractor(object):
78     """Information Extractor class.
79
80     Information extractors are the classes that, given a URL, extract
81     information about the video (or videos) the URL refers to. This
82     information includes the real video URL, the video title, author and
83     others. The information is stored in a dictionary which is then
84     passed to the YoutubeDL. The YoutubeDL processes this
85     information possibly downloading the video to the file system, among
86     other possible outcomes.
87
88     The type field determines the type of the result.
89     By far the most common value (and the default if _type is missing) is
90     "video", which indicates a single video.
91
92     For a video, the dictionaries must include the following fields:
93
94     id:             Video identifier.
95     title:          Video title, unescaped.
96
97     Additionally, it must contain either a formats entry or a url one:
98
99     formats:        A list of dictionaries for each format available, ordered
100                     from worst to best quality.
101
102                     Potential fields:
103                     * url        Mandatory. The URL of the video file
104                     * manifest_url
105                                  The URL of the manifest file in case of
106                                  fragmented media (DASH, hls, hds)
107                     * ext        Will be calculated from URL if missing
108                     * format     A human-readable description of the format
109                                  ("mp4 container with h264/opus").
110                                  Calculated from the format_id, width, height.
111                                  and format_note fields if missing.
112                     * format_id  A short description of the format
113                                  ("mp4_h264_opus" or "19").
114                                 Technically optional, but strongly recommended.
115                     * format_note Additional info about the format
116                                  ("3D" or "DASH video")
117                     * width      Width of the video, if known
118                     * height     Height of the video, if known
119                     * resolution Textual description of width and height
120                     * tbr        Average bitrate of audio and video in KBit/s
121                     * abr        Average audio bitrate in KBit/s
122                     * acodec     Name of the audio codec in use
123                     * asr        Audio sampling rate in Hertz
124                     * vbr        Average video bitrate in KBit/s
125                     * fps        Frame rate
126                     * vcodec     Name of the video codec in use
127                     * container  Name of the container format
128                     * filesize   The number of bytes, if known in advance
129                     * filesize_approx  An estimate for the number of bytes
130                     * player_url SWF Player URL (used for rtmpdump).
131                     * protocol   The protocol that will be used for the actual
132                                  download, lower-case.
133                                  "http", "https", "rtsp", "rtmp", "rtmpe",
134                                  "m3u8", "m3u8_native" or "http_dash_segments".
135                     * fragment_base_url
136                                  Base URL for fragments. Each fragment's path
137                                  value (if present) will be relative to
138                                  this URL.
139                     * fragments  A list of fragments of a fragmented media.
140                                  Each fragment entry must contain either an url
141                                  or a path. If an url is present it should be
142                                  considered by a client. Otherwise both path and
143                                  fragment_base_url must be present. Here is
144                                  the list of all potential fields:
145                                  * "url" - fragment's URL
146                                  * "path" - fragment's path relative to
147                                             fragment_base_url
148                                  * "duration" (optional, int or float)
149                                  * "filesize" (optional, int)
150                     * preference Order number of this format. If this field is
151                                  present and not None, the formats get sorted
152                                  by this field, regardless of all other values.
153                                  -1 for default (order by other properties),
154                                  -2 or smaller for less than default.
155                                  < -1000 to hide the format (if there is
156                                     another one which is strictly better)
157                     * language   Language code, e.g. "de" or "en-US".
158                     * language_preference  Is this in the language mentioned in
159                                  the URL?
160                                  10 if it's what the URL is about,
161                                  -1 for default (don't know),
162                                  -10 otherwise, other values reserved for now.
163                     * quality    Order number of the video quality of this
164                                  format, irrespective of the file format.
165                                  -1 for default (order by other properties),
166                                  -2 or smaller for less than default.
167                     * source_preference  Order number for this video source
168                                   (quality takes higher priority)
169                                  -1 for default (order by other properties),
170                                  -2 or smaller for less than default.
171                     * http_headers  A dictionary of additional HTTP headers
172                                  to add to the request.
173                     * stretched_ratio  If given and not 1, indicates that the
174                                  video's pixels are not square.
175                                  width : height ratio as float.
176                     * no_resume  The server does not support resuming the
177                                  (HTTP or RTMP) download. Boolean.
178                     * downloader_options  A dictionary of downloader options as
179                                  described in FileDownloader
180
181     url:            Final video URL.
182     ext:            Video filename extension.
183     format:         The video format, defaults to ext (used for --get-format)
184     player_url:     SWF Player URL (used for rtmpdump).
185
186     The following fields are optional:
187
188     alt_title:      A secondary title of the video.
189     display_id      An alternative identifier for the video, not necessarily
190                     unique, but available before title. Typically, id is
191                     something like "4234987", title "Dancing naked mole rats",
192                     and display_id "dancing-naked-mole-rats"
193     thumbnails:     A list of dictionaries, with the following entries:
194                         * "id" (optional, string) - Thumbnail format ID
195                         * "url"
196                         * "preference" (optional, int) - quality of the image
197                         * "width" (optional, int)
198                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
200                                         deprecated)
201                         * "filesize" (optional, int)
202     thumbnail:      Full URL to a video thumbnail image.
203     description:    Full video description.
204     uploader:       Full name of the video uploader.
205     license:        License name the video is licensed under.
206     creator:        The creator of the video.
207     release_date:   The date (YYYYMMDD) when the video was released.
208     timestamp:      UNIX timestamp of the moment the video became available.
209     upload_date:    Video upload date (YYYYMMDD).
210                     If not explicitly set, calculated from timestamp.
211     uploader_id:    Nickname or id of the video uploader.
212     uploader_url:   Full URL to a personal webpage of the video uploader.
213     location:       Physical location where the video was filmed.
214     subtitles:      The available subtitles as a dictionary in the format
215                     {tag: subformats}. "tag" is usually a language code, and
216                     "subformats" is a list sorted from lower to higher
217                     preference, each element is a dictionary with the "ext"
218                     entry and one of:
219                         * "data": The subtitles file contents
220                         * "url": A URL pointing to the subtitles file
221                     "ext" will be calculated from URL if missing
222     automatic_captions: Like 'subtitles', used by the YoutubeIE for
223                     automatically generated captions
224     duration:       Length of the video in seconds, as an integer or float.
225     view_count:     How many users have watched the video on the platform.
226     like_count:     Number of positive ratings of the video
227     dislike_count:  Number of negative ratings of the video
228     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
230     comment_count:  Number of comments on the video
231     comments:       A list of comments, each with one or more of the following
232                     properties (all but one of text or html optional):
233                         * "author" - human-readable name of the comment author
234                         * "author_id" - user ID of the comment author
235                         * "id" - Comment ID
236                         * "html" - Comment as HTML
237                         * "text" - Plain text of the comment
238                         * "timestamp" - UNIX timestamp of comment
239                         * "parent" - ID of the comment this one is replying to.
240                                      Set to "root" to indicate that this is a
241                                      comment to the original video.
242     age_limit:      Age restriction for the video, as an integer (years)
243     webpage_url:    The URL to the video webpage, if given to youtube-dl it
244                     should allow to get the same result again. (It will be set
245                     by YoutubeDL if it's missing)
246     categories:     A list of categories that the video falls in, for example
247                     ["Sports", "Berlin"]
248     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
249     is_live:        True, False, or None (=unknown). Whether this video is a
250                     live stream that goes on instead of a fixed-length video.
251     start_time:     Time in seconds where the reproduction should start, as
252                     specified in the URL.
253     end_time:       Time in seconds where the reproduction should end, as
254                     specified in the URL.
255     chapters:       A list of dictionaries, with the following entries:
256                         * "start_time" - The start time of the chapter in seconds
257                         * "end_time" - The end time of the chapter in seconds
258                         * "title" (optional, string)
259
260     The following fields should only be used when the video belongs to some logical
261     chapter or section:
262
263     chapter:        Name or title of the chapter the video belongs to.
264     chapter_number: Number of the chapter the video belongs to, as an integer.
265     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
266
267     The following fields should only be used when the video is an episode of some
268     series, programme or podcast:
269
270     series:         Title of the series or programme the video episode belongs to.
271     season:         Title of the season the video episode belongs to.
272     season_number:  Number of the season the video episode belongs to, as an integer.
273     season_id:      Id of the season the video episode belongs to, as a unicode string.
274     episode:        Title of the video episode. Unlike mandatory video title field,
275                     this field should denote the exact title of the video episode
276                     without any kind of decoration.
277     episode_number: Number of the video episode within a season, as an integer.
278     episode_id:     Id of the video episode, as a unicode string.
279
280     The following fields should only be used when the media is a track or a part of
281     a music album:
282
283     track:          Title of the track.
284     track_number:   Number of the track within an album or a disc, as an integer.
285     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
286                     as a unicode string.
287     artist:         Artist(s) of the track.
288     genre:          Genre(s) of the track.
289     album:          Title of the album the track belongs to.
290     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
291     album_artist:   List of all artists appeared on the album (e.g.
292                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
293                     and compilations).
294     disc_number:    Number of the disc or other physical medium the track belongs to,
295                     as an integer.
296     release_year:   Year (YYYY) when the album was released.
297
298     Unless mentioned otherwise, the fields should be Unicode strings.
299
300     Unless mentioned otherwise, None is equivalent to absence of information.
301
302
303     _type "playlist" indicates multiple videos.
304     There must be a key "entries", which is a list, an iterable, or a PagedList
305     object, each element of which is a valid dictionary by this specification.
306
307     Additionally, playlists can have "id", "title", "description", "uploader",
308     "uploader_id", "uploader_url" attributes with the same semantics as videos
309     (see above).
310
311
312     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
314     It must have an entries key like a playlist and contain all the keys
315     required for a video at the same time.
316
317
318     _type "url" indicates that the video must be extracted from another
319     location, possibly by a different extractor. Its only required key is:
320     "url" - the next URL to extract.
321     The key "ie_key" can be set to the class name (minus the trailing "IE",
322     e.g. "Youtube") if the extractor class is known in advance.
323     Additionally, the dictionary may have any properties of the resolved entity
324     known in advance, for example "title" if the title of the referred video is
325     known ahead of time.
326
327
328     _type "url_transparent" entities have the same specification as "url", but
329     indicate that the given additional information is more precise than the one
330     associated with the resolved URL.
331     This is useful when a site employs a video service that hosts the video and
332     its technical metadata, but that video service does not embed a useful
333     title, description etc.
334
335
336     Subclasses of this one should re-define the _real_initialize() and
337     _real_extract() methods and define a _VALID_URL regexp.
338     Probably, they should also be added to the list of extractors.
339
340     _GEO_BYPASS attribute may be set to False in order to disable
341     geo restriction bypass mechanisms for a particular extractor.
342     Though it won't disable explicit geo restriction bypass based on
343     country code provided with geo_bypass_country.
344
345     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
346     countries for this extractor. One of these countries will be used by
347     geo restriction bypass mechanism right away in order to bypass
348     geo restriction, of course, if the mechanism is not disabled.
349
350     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
351     IP blocks in CIDR notation for this extractor. One of these IP blocks
352     will be used by geo restriction bypass mechanism similarly
353     to _GEO_COUNTRIES.
354
355     Finally, the _WORKING attribute should be set to False for broken IEs
356     in order to warn the users and skip the tests.
357     """
358
359     _ready = False
360     _downloader = None
361     _x_forwarded_for_ip = None
362     _GEO_BYPASS = True
363     _GEO_COUNTRIES = None
364     _GEO_IP_BLOCKS = None
365     _WORKING = True
366
367     def __init__(self, downloader=None):
368         """Constructor. Receives an optional downloader."""
369         self._ready = False
370         self._x_forwarded_for_ip = None
371         self.set_downloader(downloader)
372
373     @classmethod
374     def suitable(cls, url):
375         """Receives a URL and returns True if suitable for this IE."""
376
377         # This does not use has/getattr intentionally - we want to know whether
378         # we have cached the regexp for *this* class, whereas getattr would also
379         # match the superclass
380         if '_VALID_URL_RE' not in cls.__dict__:
381             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
382         return cls._VALID_URL_RE.match(url) is not None
383
384     @classmethod
385     def _match_id(cls, url):
386         if '_VALID_URL_RE' not in cls.__dict__:
387             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
388         m = cls._VALID_URL_RE.match(url)
389         assert m
390         return compat_str(m.group('id'))
391
392     @classmethod
393     def working(cls):
394         """Getter method for _WORKING."""
395         return cls._WORKING
396
397     def initialize(self):
398         """Initializes an instance (authentication, etc)."""
399         self._initialize_geo_bypass({
400             'countries': self._GEO_COUNTRIES,
401             'ip_blocks': self._GEO_IP_BLOCKS,
402         })
403         if not self._ready:
404             self._real_initialize()
405             self._ready = True
406
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        # Runs at most once per instance: once a fake IP has been picked it is
        # kept for the extractor's whole lifetime.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            # Path 1 wins if any block was found: note the early return below,
            # which prevents path 2 from overwriting the chosen IP.
            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            # NOTE(review): random_ipv4 is given a bare country code here
            # (not a CIDR block as in path 1) — presumably it maps codes to
            # known IP ranges; confirm against utils.GeoUtils.
            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))
494
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most one retry: a GeoRestrictedError may trigger a second
            # attempt with a faked X-Forwarded-For IP (see
            # __maybe_fake_ip_and_retry); any further failure propagates.
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    # Record the faked IP in the result so later requests
                    # (e.g. actual downloads) keep using the same one.
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        # Order of these clauses matters: ExtractorError is re-raised as-is
        # before the generic wrappers below can touch it.
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
515
516     def __maybe_fake_ip_and_retry(self, countries):
517         if (not self._downloader.params.get('geo_bypass_country', None) and
518                 self._GEO_BYPASS and
519                 self._downloader.params.get('geo_bypass', True) and
520                 not self._x_forwarded_for_ip and
521                 countries):
522             country_code = random.choice(countries)
523             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
524             if self._x_forwarded_for_ip:
525                 self.report_warning(
526                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
527                     % (self._x_forwarded_for_ip, country_code.upper()))
528                 return True
529         return False
530
531     def set_downloader(self, downloader):
532         """Sets the downloader for this IE."""
533         self._downloader = downloader
534
535     def _real_initialize(self):
536         """Real initialization process. Redefine in subclasses."""
537         pass
538
539     def _real_extract(self, url):
540         """Real extraction process. Redefine in subclasses."""
541         pass
542
543     @classmethod
544     def ie_key(cls):
545         """A string for getting the InfoExtractor with get_info_extractor"""
546         return compat_str(cls.__name__[:-2])
547
548     @property
549     def IE_NAME(self):
550         return compat_str(type(self).__name__[:-2])
551
552     @staticmethod
553     def __can_accept_status_code(err, expected_status):
554         assert isinstance(err, compat_urllib_error.HTTPError)
555         if expected_status is None:
556             return False
557         if isinstance(expected_status, compat_integer_types):
558             return err.code == expected_status
559         elif isinstance(expected_status, (list, tuple)):
560             return err.code in expected_status
561         elif callable(expected_status):
562             return expected_status(err.code) is True
563         else:
564             assert False
565
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        # note=None -> default "Downloading webpage" message,
        # note=False -> no message at all.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Respect an X-Forwarded-For header supplied by the caller.
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        # Normalize url_or_request into a single Request carrying data,
        # headers and query parameters.
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # An HTTP error whose status the caller declared acceptable via
            # expected_status is not an error: hand back its response body.
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.fp

            # errnote=False -> fail silently, errnote=None -> generic message.
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                # Non-fatal failures are reported as warnings; callers must
                # check for a False return value.
                self._downloader.report_warning(errmsg)
                return False
616     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
617         """
618         Return a tuple (page content as string, URL handle).
619
620         See _download_webpage docstring for arguments specification.
621         """
622         # Strip hashes from the URL (#1038)
623         if isinstance(url_or_request, (compat_str, str)):
624             url_or_request = url_or_request.partition('#')[0]
625
626         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
627         if urlh is False:
628             assert not fatal
629             return False
630         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
631         return (content, urlh)
632
633     @staticmethod
634     def _guess_encoding_from_content(content_type, webpage_bytes):
635         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
636         if m:
637             encoding = m.group(1)
638         else:
639             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
640                           webpage_bytes[:1024])
641             if m:
642                 encoding = m.group(1).decode('ascii')
643             elif webpage_bytes.startswith(b'\xff\xfe'):
644                 encoding = 'utf-16'
645             else:
646                 encoding = 'utf-8'
647
648         return encoding
649
650     def __check_blocked(self, content):
651         first_block = content[:512]
652         if ('<title>Access to this site is blocked</title>' in content and
653                 'Websense' in first_block):
654             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
655             blocked_iframe = self._html_search_regex(
656                 r'<iframe src="([^"]+)"', content,
657                 'Websense information URL', default=None)
658             if blocked_iframe:
659                 msg += ' Visit %s for more details' % blocked_iframe
660             raise ExtractorError(msg, expected=True)
661         if '<title>The URL you requested has been blocked</title>' in first_block:
662             msg = (
663                 'Access to this webpage has been blocked by Indian censorship. '
664                 'Use a VPN or proxy server (with --proxy) to route around it.')
665             block_msg = self._html_search_regex(
666                 r'</h1><p>(.*?)</p>',
667                 content, 'block message', default=None)
668             if block_msg:
669                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
670             raise ExtractorError(msg, expected=True)
671         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
672                 'blocklist.rkn.gov.ru' in content):
673             raise ExtractorError(
674                 'Access to this webpage has been blocked by decision of the Russian government. '
675                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
676                 expected=True)
677
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read the body of an already-opened URL handle and decode it to text.

        prefix -- optional bytes prepended to the body before decoding
        encoding -- forced character encoding; guessed from the response
            when not given

        Honors the dump_intermediate_pages and write_pages downloader
        options, and raises through __check_blocked when the page is a
        censorship block page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps arbitrary (possibly binary) bodies console-safe
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Truncate long names; the md5 suffix keeps them unique
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to UTF-8 rather than failing
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
714
715     def _download_webpage(
716             self, url_or_request, video_id, note=None, errnote=None,
717             fatal=True, tries=1, timeout=5, encoding=None, data=None,
718             headers={}, query={}, expected_status=None):
719         """
720         Return the data of the page as a string.
721
722         Arguments:
723         url_or_request -- plain text URL as a string or
724             a compat_urllib_request.Requestobject
725         video_id -- Video/playlist/item identifier (string)
726
727         Keyword arguments:
728         note -- note printed before downloading (string)
729         errnote -- note printed in case of an error (string)
730         fatal -- flag denoting whether error should be considered fatal,
731             i.e. whether it should cause ExtractionError to be raised,
732             otherwise a warning will be reported and extraction continued
733         tries -- number of tries
734         timeout -- sleep interval between tries
735         encoding -- encoding for a page content decoding, guessed automatically
736             when not explicitly specified
737         data -- POST data (bytes)
738         headers -- HTTP headers (dict)
739         query -- URL query (dict)
740         expected_status -- allows to accept failed HTTP requests (non 2xx
741             status code) by explicitly specifying a set of accepted status
742             codes. Can be any of the following entities:
743                 - an integer type specifying an exact failed status code to
744                   accept
745                 - a list or a tuple of integer types specifying a list of
746                   failed status codes to accept
747                 - a callable accepting an actual failed status code and
748                   returning True if it should be accepted
749             Note that this argument does not affect success status codes (2xx)
750             which are always accepted.
751         """
752
753         success = False
754         try_count = 0
755         while success is False:
756             try:
757                 res = self._download_webpage_handle(
758                     url_or_request, video_id, note, errnote, fatal,
759                     encoding=encoding, data=data, headers=headers, query=query,
760                     expected_status=expected_status)
761                 success = True
762             except compat_http_client.IncompleteRead as e:
763                 try_count += 1
764                 if try_count >= tries:
765                     raise e
766                 self._sleep(timeout, video_id)
767         if res is False:
768             return res
769         else:
770             content, _ = res
771             return content
772
773     def _download_xml_handle(
774             self, url_or_request, video_id, note='Downloading XML',
775             errnote='Unable to download XML', transform_source=None,
776             fatal=True, encoding=None, data=None, headers={}, query={},
777             expected_status=None):
778         """
779         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
780
781         See _download_webpage docstring for arguments specification.
782         """
783         res = self._download_webpage_handle(
784             url_or_request, video_id, note, errnote, fatal=fatal,
785             encoding=encoding, data=data, headers=headers, query=query,
786             expected_status=expected_status)
787         if res is False:
788             return res
789         xml_string, urlh = res
790         return self._parse_xml(
791             xml_string, video_id, transform_source=transform_source,
792             fatal=fatal), urlh
793
794     def _download_xml(
795             self, url_or_request, video_id,
796             note='Downloading XML', errnote='Unable to download XML',
797             transform_source=None, fatal=True, encoding=None,
798             data=None, headers={}, query={}, expected_status=None):
799         """
800         Return the xml as an xml.etree.ElementTree.Element.
801
802         See _download_webpage docstring for arguments specification.
803         """
804         res = self._download_xml_handle(
805             url_or_request, video_id, note=note, errnote=errnote,
806             transform_source=transform_source, fatal=fatal, encoding=encoding,
807             data=data, headers=headers, query=query,
808             expected_status=expected_status)
809         return res if res is False else res[0]
810
811     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
812         if transform_source:
813             xml_string = transform_source(xml_string)
814         try:
815             return compat_etree_fromstring(xml_string.encode('utf-8'))
816         except compat_xml_parse_error as ve:
817             errmsg = '%s: Failed to parse XML ' % video_id
818             if fatal:
819                 raise ExtractorError(errmsg, cause=ve)
820             else:
821                 self.report_warning(errmsg + str(ve))
822
823     def _download_json_handle(
824             self, url_or_request, video_id, note='Downloading JSON metadata',
825             errnote='Unable to download JSON metadata', transform_source=None,
826             fatal=True, encoding=None, data=None, headers={}, query={},
827             expected_status=None):
828         """
829         Return a tuple (JSON object, URL handle).
830
831         See _download_webpage docstring for arguments specification.
832         """
833         res = self._download_webpage_handle(
834             url_or_request, video_id, note, errnote, fatal=fatal,
835             encoding=encoding, data=data, headers=headers, query=query,
836             expected_status=expected_status)
837         if res is False:
838             return res
839         json_string, urlh = res
840         return self._parse_json(
841             json_string, video_id, transform_source=transform_source,
842             fatal=fatal), urlh
843
844     def _download_json(
845             self, url_or_request, video_id, note='Downloading JSON metadata',
846             errnote='Unable to download JSON metadata', transform_source=None,
847             fatal=True, encoding=None, data=None, headers={}, query={},
848             expected_status=None):
849         """
850         Return the JSON object as a dict.
851
852         See _download_webpage docstring for arguments specification.
853         """
854         res = self._download_json_handle(
855             url_or_request, video_id, note=note, errnote=errnote,
856             transform_source=transform_source, fatal=fatal, encoding=encoding,
857             data=data, headers=headers, query=query,
858             expected_status=expected_status)
859         return res if res is False else res[0]
860
861     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
862         if transform_source:
863             json_string = transform_source(json_string)
864         try:
865             return json.loads(json_string)
866         except ValueError as ve:
867             errmsg = '%s: Failed to parse JSON ' % video_id
868             if fatal:
869                 raise ExtractorError(errmsg, cause=ve)
870             else:
871                 self.report_warning(errmsg + str(ve))
872
873     def report_warning(self, msg, video_id=None):
874         idstr = '' if video_id is None else '%s: ' % video_id
875         self._downloader.report_warning(
876             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
877
878     def to_screen(self, msg):
879         """Print msg to screen, prefixing it with '[ie_name]'"""
880         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
881
882     def report_extraction(self, id_or_name):
883         """Report information extraction."""
884         self.to_screen('%s: Extracting information' % id_or_name)
885
886     def report_download_webpage(self, video_id):
887         """Report webpage download."""
888         self.to_screen('%s: Downloading webpage' % video_id)
889
    def report_age_confirmation(self):
        """Report attempt to confirm age (printed via to_screen)."""
        self.to_screen('Confirming age')
893
    def report_login(self):
        """Report attempt to log in (printed via to_screen)."""
        self.to_screen('Logging in')
897
898     @staticmethod
899     def raise_login_required(msg='This video is only available for registered users'):
900         raise ExtractorError(
901             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
902             expected=True)
903
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        # Abort extraction with a geo-restriction error; `countries` is
        # presumably a list of country codes where the video is available
        # (consumed by geo-bypass handling) -- confirm with GeoRestrictedError.
        raise GeoRestrictedError(msg, countries=countries)
907
908     # Methods for following #608
909     @staticmethod
910     def url_result(url, ie=None, video_id=None, video_title=None):
911         """Returns a URL that points to a page that should be processed"""
912         # TODO: ie should be the class used for getting the info
913         video_info = {'_type': 'url',
914                       'url': url,
915                       'ie_key': ie}
916         if video_id is not None:
917             video_info['id'] = video_id
918         if video_title is not None:
919             video_info['title'] = video_title
920         return video_info
921
922     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
923         urls = orderedSet(
924             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
925             for m in matches)
926         return self.playlist_result(
927             urls, playlist_id=playlist_id, playlist_title=playlist_title)
928
929     @staticmethod
930     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
931         """Returns a playlist"""
932         video_info = {'_type': 'playlist',
933                       'entries': entries}
934         if playlist_id:
935             video_info['id'] = playlist_id
936         if playlist_title:
937             video_info['title'] = playlist_title
938         if playlist_description:
939             video_info['description'] = playlist_description
940         return video_info
941
942     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
943         """
944         Perform a regex search on the given string, using a single or a list of
945         patterns returning the first matching group.
946         In case of failure return a default value or raise a WARNING or a
947         RegexNotFoundError, depending on fatal, specifying the field name.
948         """
949         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
950             mobj = re.search(pattern, string, flags)
951         else:
952             for p in pattern:
953                 mobj = re.search(p, string, flags)
954                 if mobj:
955                     break
956
957         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
958             _name = '\033[0;34m%s\033[0m' % name
959         else:
960             _name = name
961
962         if mobj:
963             if group is None:
964                 # return the first matching group
965                 return next(g for g in mobj.groups() if g is not None)
966             else:
967                 return mobj.group(group)
968         elif default is not NO_DEFAULT:
969             return default
970         elif fatal:
971             raise RegexNotFoundError('Unable to extract %s' % _name)
972         else:
973             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
974             return None
975
976     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
977         """
978         Like _search_regex, but strips HTML tags and unescapes entities.
979         """
980         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
981         if res:
982             return clean_html(res).strip()
983         else:
984             return res
985
986     def _get_netrc_login_info(self, netrc_machine=None):
987         username = None
988         password = None
989         netrc_machine = netrc_machine or self._NETRC_MACHINE
990
991         if self._downloader.params.get('usenetrc', False):
992             try:
993                 info = netrc.netrc().authenticators(netrc_machine)
994                 if info is not None:
995                     username = info[0]
996                     password = info[2]
997                 else:
998                     raise netrc.NetrcParseError(
999                         'No authenticators for %s' % netrc_machine)
1000             except (IOError, netrc.NetrcParseError) as err:
1001                 self._downloader.report_warning(
1002                     'parsing .netrc: %s' % error_to_compat_str(err))
1003
1004         return username, password
1005
1006     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1007         """
1008         Get the login info as (username, password)
1009         First look for the manually specified credentials using username_option
1010         and password_option as keys in params dictionary. If no such credentials
1011         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1012         value.
1013         If there's no info available, return (None, None)
1014         """
1015         if self._downloader is None:
1016             return (None, None)
1017
1018         downloader_params = self._downloader.params
1019
1020         # Attempt to use provided username and password or .netrc data
1021         if downloader_params.get(username_option) is not None:
1022             username = downloader_params[username_option]
1023             password = downloader_params[password_option]
1024         else:
1025             username, password = self._get_netrc_login_info(netrc_machine)
1026
1027         return username, password
1028
1029     def _get_tfa_info(self, note='two-factor verification code'):
1030         """
1031         Get the two-factor authentication info
1032         TODO - asking the user will be required for sms/phone verify
1033         currently just uses the command line option
1034         If there's no info available, return None
1035         """
1036         if self._downloader is None:
1037             return None
1038         downloader_params = self._downloader.params
1039
1040         if downloader_params.get('twofactor') is not None:
1041             return downloader_params['twofactor']
1042
1043         return compat_getpass('Type %s and press [Return]: ' % note)
1044
1045     # Helper functions for extracting OpenGraph info
1046     @staticmethod
1047     def _og_regexes(prop):
1048         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1049         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
1050                        % {'prop': re.escape(prop)})
1051         template = r'<meta[^>]+?%s[^>]+?%s'
1052         return [
1053             template % (property_re, content_re),
1054             template % (content_re, property_re),
1055         ]
1056
1057     @staticmethod
1058     def _meta_regex(prop):
1059         return r'''(?isx)<meta
1060                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1061                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1062
1063     def _og_search_property(self, prop, html, name=None, **kargs):
1064         if not isinstance(prop, (list, tuple)):
1065             prop = [prop]
1066         if name is None:
1067             name = 'OpenGraph %s' % prop[0]
1068         og_regexes = []
1069         for p in prop:
1070             og_regexes.extend(self._og_regexes(p))
1071         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1072         if escaped is None:
1073             return None
1074         return unescapeHTML(escaped)
1075
    def _og_search_thumbnail(self, html, **kargs):
        """Non-fatal search for the og:image thumbnail URL."""
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1078
    def _og_search_description(self, html, **kargs):
        """Non-fatal search for the og:description text."""
        return self._og_search_property('description', html, fatal=False, **kargs)
1081
    def _og_search_title(self, html, **kargs):
        """Search for the og:title value (fatal by default, as in _search_regex)."""
        return self._og_search_property('title', html, **kargs)
1084
1085     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1086         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1087         if secure:
1088             regexes = self._og_regexes('video:secure_url') + regexes
1089         return self._html_search_regex(regexes, html, name, **kargs)
1090
    def _og_search_url(self, html, **kargs):
        """Search for the og:url canonical URL."""
        return self._og_search_property('url', html, **kargs)
1093
1094     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1095         if not isinstance(name, (list, tuple)):
1096             name = [name]
1097         if display_name is None:
1098             display_name = name[0]
1099         return self._html_search_regex(
1100             [self._meta_regex(n) for n in name],
1101             html, display_name, fatal=fatal, group='content', **kwargs)
1102
    def _dc_search_uploader(self, html):
        """Extract the uploader from a Dublin Core dc.creator meta tag."""
        return self._html_search_meta('dc.creator', html, 'uploader')
1105
1106     def _rta_search(self, html):
1107         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1108         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1109                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1110                      html):
1111             return 18
1112         return 0
1113
1114     def _media_rating_search(self, html):
1115         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1116         rating = self._html_search_meta('rating', html)
1117
1118         if not rating:
1119             return None
1120
1121         RATING_TABLE = {
1122             'safe for kids': 0,
1123             'general': 8,
1124             '14 years': 14,
1125             'mature': 17,
1126             'restricted': 19,
1127         }
1128         return RATING_TABLE.get(rating.lower())
1129
1130     def _family_friendly_search(self, html):
1131         # See http://schema.org/VideoObject
1132         family_friendly = self._html_search_meta(
1133             'isFamilyFriendly', html, default=None)
1134
1135         if not family_friendly:
1136             return None
1137
1138         RATING_TABLE = {
1139             '1': 0,
1140             'true': 0,
1141             '0': 18,
1142             'false': 18,
1143         }
1144         return RATING_TABLE.get(family_friendly.lower())
1145
    def _twitter_search_player(self, html):
        """Extract the Twitter card player URL from a twitter:player meta tag."""
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
1149
1150     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1151         json_ld = self._search_regex(
1152             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
1153             html, 'JSON-LD', group='json_ld', **kwargs)
1154         default = kwargs.get('default', NO_DEFAULT)
1155         if not json_ld:
1156             return default if default is not NO_DEFAULT else {}
1157         # JSON-LD may be malformed and thus `fatal` should be respected.
1158         # At the same time `default` may be passed that assumes `fatal=False`
1159         # for _search_regex. Let's simulate the same behavior here as well.
1160         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1161         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1162
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Convert JSON-LD metadata (a string or an already-parsed object) into
        an info dict. Only entries whose @context is schema.org are
        considered; None values are dropped from the result.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize a single entry to a one-element list
            json_ld = [json_ld]

        # Maps schema.org interactionType name suffix -> info dict count kind
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            # Fill <kind>_count fields from InteractionCounter entries;
            # the first occurrence of each kind wins
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Copy the common VideoObject fields into info
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # continue (not break): keep processing remaining entries
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                # Any other recognized schema.org entry ends the scan
                break
        return dict((k, v) for k, v in info.items() if v is not None)
1256
1257     @staticmethod
1258     def _hidden_inputs(html):
1259         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1260         hidden_inputs = {}
1261         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1262             attrs = extract_attributes(input)
1263             if not input:
1264                 continue
1265             if attrs.get('type') not in ('hidden', 'submit'):
1266                 continue
1267             name = attrs.get('name') or attrs.get('id')
1268             value = attrs.get('value')
1269             if name and value is not None:
1270                 hidden_inputs[name] = value
1271         return hidden_inputs
1272
1273     def _form_hidden_inputs(self, form_id, html):
1274         form = self._search_regex(
1275             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1276             html, '%s form' % form_id, group='form')
1277         return self._hidden_inputs(form)
1278
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort `formats` in place from worst to best.

        field_preference -- optional list/tuple of format dict keys; when
            given, formats are ordered by those fields alone (in that order)
            instead of the default preference heuristics below.
        Raises ExtractorError when `formats` is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Missing fields sort last ('' for format_id, -1 otherwise)
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare lexicographically; earlier entries dominate
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1354
1355     def _check_formats(self, formats, video_id):
1356         if formats:
1357             formats[:] = filter(
1358                 lambda f: self._is_valid_url(
1359                     f['url'], video_id,
1360                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1361                 formats)
1362
1363     @staticmethod
1364     def _remove_duplicate_formats(formats):
1365         format_urls = set()
1366         unique_formats = []
1367         for f in formats:
1368             if f['url'] not in format_urls:
1369                 format_urls.add(f['url'])
1370                 unique_formats.append(f)
1371         formats[:] = unique_formats
1372
1373     def _is_valid_url(self, url, video_id, item='video', headers={}):
1374         url = self._proto_relative_url(url, scheme='http:')
1375         # For now assume non HTTP(S) URLs always valid
1376         if not (url.startswith('http://') or url.startswith('https://')):
1377             return True
1378         try:
1379             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1380             return True
1381         except ExtractorError as e:
1382             if isinstance(e.cause, compat_urllib_error.URLError):
1383                 self.to_screen(
1384                     '%s: %s URL is invalid, skipping' % (video_id, item))
1385                 return False
1386             raise
1387
1388     def http_scheme(self):
1389         """ Either "http:" or "https:", depending on the user's preferences """
1390         return (
1391             'http:'
1392             if self._downloader.params.get('prefer_insecure', False)
1393             else 'https:')
1394
1395     def _proto_relative_url(self, url, scheme=None):
1396         if url is None:
1397             return url
1398         if url.startswith('//'):
1399             if scheme is None:
1400                 scheme = self.http_scheme()
1401             return scheme + url
1402         else:
1403             return url
1404
1405     def _sleep(self, timeout, video_id, msg_template=None):
1406         if msg_template is None:
1407             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1408         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1409         self.to_screen(msg)
1410         time.sleep(timeout)
1411
1412     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1413                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1414                              fatal=True, m3u8_id=None):
1415         manifest = self._download_xml(
1416             manifest_url, video_id, 'Downloading f4m manifest',
1417             'Unable to download f4m manifest',
1418             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1419             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1420             transform_source=transform_source,
1421             fatal=fatal)
1422
1423         if manifest is False:
1424             return []
1425
1426         return self._parse_f4m_formats(
1427             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1428             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1429
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an Adobe HDS (f4m) manifest document into a list of formats.

        manifest: parsed XML root element of the f4m manifest.
        manifest_url: URL the manifest was fetched from; used to resolve
            relative media URLs and reported as the format URL.
        Returns [] for manifests protected by Akamai player verification or
        containing only DRM-protected media.  Nested f4m/m3u8 manifests
        referenced by set-level manifests are extracted recursively.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the f4m 2.0 namespace
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # audio-only manifests get vcodec 'none' so sorting ranks them correctly
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1527
1528     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1529         return {
1530             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1531             'url': m3u8_url,
1532             'ext': ext,
1533             'protocol': 'm3u8',
1534             'preference': preference - 100 if preference else -100,
1535             'resolution': 'multiple',
1536             'format_note': 'Quality selection URL',
1537         }
1538
1539     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1540                               entry_protocol='m3u8', preference=None,
1541                               m3u8_id=None, note=None, errnote=None,
1542                               fatal=True, live=False):
1543         res = self._download_webpage_handle(
1544             m3u8_url, video_id,
1545             note=note or 'Downloading m3u8 information',
1546             errnote=errnote or 'Failed to download m3u8 information',
1547             fatal=fatal)
1548
1549         if res is False:
1550             return []
1551
1552         m3u8_doc, urlh = res
1553         m3u8_url = urlh.geturl()
1554
1555         return self._parse_m3u8_formats(
1556             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1557             preference=preference, m3u8_id=m3u8_id, live=live)
1558
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS playlist document into a list of formats.

        A media playlist (detected via #EXT-X-TARGETDURATION) is returned as
        a single format entry pointing at m3u8_url itself.  Master playlists
        yield one format per EXT-X-STREAM-INF variant plus one per AUDIO/VIDEO
        EXT-X-MEDIA rendition with a URI.  DRM-protected playlists (Adobe
        Flash Access, Apple FairPlay) yield no formats.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve playlist-relative URIs against the playlist's own URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts in that group
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF line; the URI line
        # that follows it consumes and resets this.
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Non-tag line: the URI of the variant announced by the
                # preceding EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1717
1718     @staticmethod
1719     def _xpath_ns(path, namespace=None):
1720         if not namespace:
1721             return path
1722         out = []
1723         for c in path.split('/'):
1724             if not c or c == '.':
1725                 out.append(c)
1726             else:
1727                 out.append('{%s}%s' % (namespace, c))
1728         return '/'.join(out)
1729
1730     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1731         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1732
1733         if smil is False:
1734             assert not fatal
1735             return []
1736
1737         namespace = self._parse_smil_namespace(smil)
1738
1739         return self._parse_smil_formats(
1740             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1741
1742     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1743         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1744         if smil is False:
1745             return {}
1746         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1747
1748     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1749         return self._download_xml(
1750             smil_url, video_id, 'Downloading SMIL file',
1751             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1752
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Parse a SMIL document into a complete info dict with formats,
        subtitles, head metadata (title/description/upload date) and
        thumbnails."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # NOTE: the video_id parameter is replaced by the SMIL file basename
        # (without extension) for the returned info dict
        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        # First occurrence of each <meta> name wins
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
1792
1793     def _parse_smil_namespace(self, smil):
1794         return self._search_regex(
1795             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1796
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from a SMIL document.

        Handles RTMP streams, HLS (m3u8) and HDS (f4m) sub-manifests, and
        plain HTTP progressive downloads found in <video>/<audio> elements.
        f4m_params: extra query parameters appended to f4m URLs (defaults to
            an hdcore/flowplayer pair).
        transform_rtmp_url: optional callable mapping (streamer, play_path)
            to a rewritten (streamer, play_path) pair.
        """
        # <meta base=...> (or httpBase) overrides the SMIL URL as base for
        # relative media URLs
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # Deduplicate media by src across <video> and <audio> elements
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry m3u8 gets quality metadata copied from SMIL
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats
1890
1891     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1892         urls = []
1893         subtitles = {}
1894         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1895             src = textstream.get('src')
1896             if not src or src in urls:
1897                 continue
1898             urls.append(src)
1899             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1900             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1901             subtitles.setdefault(lang, []).append({
1902                 'url': src,
1903                 'ext': ext,
1904             })
1905         return subtitles
1906
1907     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1908         xspf = self._download_xml(
1909             xspf_url, playlist_id, 'Downloading xpsf playlist',
1910             'Unable to download xspf manifest', fatal=fatal)
1911         if xspf is False:
1912             return []
1913         return self._parse_xspf(
1914             xspf, playlist_id, xspf_url=xspf_url,
1915             xspf_base_url=base_url(xspf_url))
1916
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        """Parse an XSPF playlist document into a list of entry info dicts.

        xspf_base_url: base used to resolve relative <location> URLs.
        StreamOne ('s1') attributes, when present, supply per-location
        format_id/width/height metadata.
        """
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = []
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                if not format_url:
                    continue
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
                })
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries
1957
1958     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1959         res = self._download_xml_handle(
1960             mpd_url, video_id,
1961             note=note or 'Downloading MPD manifest',
1962             errnote=errnote or 'Failed to download MPD manifest',
1963             fatal=fatal)
1964         if res is False:
1965             return []
1966         mpd_doc, urlh = res
1967         mpd_base_url = base_url(urlh.geturl())
1968
1969         return self._parse_mpd_formats(
1970             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1971             formats_dict=formats_dict, mpd_url=mpd_url)
1972
1973     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1974         """
1975         Parse formats from MPD manifest.
1976         References:
1977          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1978             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1979          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1980         """
1981         if mpd_doc.get('type') == 'dynamic':
1982             return []
1983
1984         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1985
1986         def _add_ns(path):
1987             return self._xpath_ns(path, namespace)
1988
1989         def is_drm_protected(element):
1990             return element.find(_add_ns('ContentProtection')) is not None
1991
1992         def extract_multisegment_info(element, ms_parent_info):
1993             ms_info = ms_parent_info.copy()
1994
1995             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1996             # common attributes and elements.  We will only extract relevant
1997             # for us.
1998             def extract_common(source):
1999                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2000                 if segment_timeline is not None:
2001                     s_e = segment_timeline.findall(_add_ns('S'))
2002                     if s_e:
2003                         ms_info['total_number'] = 0
2004                         ms_info['s'] = []
2005                         for s in s_e:
2006                             r = int(s.get('r', 0))
2007                             ms_info['total_number'] += 1 + r
2008                             ms_info['s'].append({
2009                                 't': int(s.get('t', 0)),
2010                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2011                                 'd': int(s.attrib['d']),
2012                                 'r': r,
2013                             })
2014                 start_number = source.get('startNumber')
2015                 if start_number:
2016                     ms_info['start_number'] = int(start_number)
2017                 timescale = source.get('timescale')
2018                 if timescale:
2019                     ms_info['timescale'] = int(timescale)
2020                 segment_duration = source.get('duration')
2021                 if segment_duration:
2022                     ms_info['segment_duration'] = float(segment_duration)
2023
2024             def extract_Initialization(source):
2025                 initialization = source.find(_add_ns('Initialization'))
2026                 if initialization is not None:
2027                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2028
2029             segment_list = element.find(_add_ns('SegmentList'))
2030             if segment_list is not None:
2031                 extract_common(segment_list)
2032                 extract_Initialization(segment_list)
2033                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2034                 if segment_urls_e:
2035                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2036             else:
2037                 segment_template = element.find(_add_ns('SegmentTemplate'))
2038                 if segment_template is not None:
2039                     extract_common(segment_template)
2040                     media = segment_template.get('media')
2041                     if media:
2042                         ms_info['media'] = media
2043                     initialization = segment_template.get('initialization')
2044                     if initialization:
2045                         ms_info['initialization'] = initialization
2046                     else:
2047                         extract_Initialization(segment_template)
2048             return ms_info
2049
2050         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2051         formats = []
2052         for period in mpd_doc.findall(_add_ns('Period')):
2053             period_duration = parse_duration(period.get('duration')) or mpd_duration
2054             period_ms_info = extract_multisegment_info(period, {
2055                 'start_number': 1,
2056                 'timescale': 1,
2057             })
2058             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2059                 if is_drm_protected(adaptation_set):
2060                     continue
2061                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2062                 for representation in adaptation_set.findall(_add_ns('Representation')):
2063                     if is_drm_protected(representation):
2064                         continue
2065                     representation_attrib = adaptation_set.attrib.copy()
2066                     representation_attrib.update(representation.attrib)
2067                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2068                     mime_type = representation_attrib['mimeType']
2069                     content_type = mime_type.split('/')[0]
2070                     if content_type == 'text':
2071                         # TODO implement WebVTT downloading
2072                         pass
2073                     elif content_type in ('video', 'audio'):
2074                         base_url = ''
2075                         for element in (representation, adaptation_set, period, mpd_doc):
2076                             base_url_e = element.find(_add_ns('BaseURL'))
2077                             if base_url_e is not None:
2078                                 base_url = base_url_e.text + base_url
2079                                 if re.match(r'^https?://', base_url):
2080                                     break
2081                         if mpd_base_url and not re.match(r'^https?://', base_url):
2082                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2083                                 mpd_base_url += '/'
2084                             base_url = mpd_base_url + base_url
2085                         representation_id = representation_attrib.get('id')
2086                         lang = representation_attrib.get('lang')
2087                         url_el = representation.find(_add_ns('BaseURL'))
2088                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2089                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2090                         f = {
2091                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2092                             'url': base_url,
2093                             'manifest_url': mpd_url,
2094                             'ext': mimetype2ext(mime_type),
2095                             'width': int_or_none(representation_attrib.get('width')),
2096                             'height': int_or_none(representation_attrib.get('height')),
2097                             'tbr': float_or_none(bandwidth, 1000),
2098                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2099                             'fps': int_or_none(representation_attrib.get('frameRate')),
2100                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2101                             'format_note': 'DASH %s' % content_type,
2102                             'filesize': filesize,
2103                             'container': mimetype2ext(mime_type) + '_dash',
2104                         }
2105                         f.update(parse_codecs(representation_attrib.get('codecs')))
2106                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2107
2108                         def prepare_template(template_name, identifiers):
2109                             tmpl = representation_ms_info[template_name]
2110                             # First of, % characters outside $...$ templates
2111                             # must be escaped by doubling for proper processing
2112                             # by % operator string formatting used further (see
2113                             # https://github.com/rg3/youtube-dl/issues/16867).
2114                             t = ''
2115                             in_template = False
2116                             for c in tmpl:
2117                                 t += c
2118                                 if c == '$':
2119                                     in_template = not in_template
2120                                 elif c == '%' and not in_template:
2121                                     t += c
2122                             # Next, $...$ templates are translated to their
2123                             # %(...) counterparts to be used with % operator
2124                             t = t.replace('$RepresentationID$', representation_id)
2125                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2126                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2127                             t.replace('$$', '$')
2128                             return t
2129
2130                         # @initialization is a regular template like @media one
2131                         # so it should be handled just the same way (see
2132                         # https://github.com/rg3/youtube-dl/issues/11605)
2133                         if 'initialization' in representation_ms_info:
2134                             initialization_template = prepare_template(
2135                                 'initialization',
2136                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2137                                 # $Time$ shall not be included for @initialization thus
2138                                 # only $Bandwidth$ remains
2139                                 ('Bandwidth', ))
2140                             representation_ms_info['initialization_url'] = initialization_template % {
2141                                 'Bandwidth': bandwidth,
2142                             }
2143
2144                         def location_key(location):
2145                             return 'url' if re.match(r'^https?://', location) else 'path'
2146
2147                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2148
2149                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2150                             media_location_key = location_key(media_template)
2151
2152                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2153                             # can't be used at the same time
2154                             if '%(Number' in media_template and 's' not in representation_ms_info:
2155                                 segment_duration = None
2156                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2157                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2158                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2159                                 representation_ms_info['fragments'] = [{
2160                                     media_location_key: media_template % {
2161                                         'Number': segment_number,
2162                                         'Bandwidth': bandwidth,
2163                                     },
2164                                     'duration': segment_duration,
2165                                 } for segment_number in range(
2166                                     representation_ms_info['start_number'],
2167                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2168                             else:
2169                                 # $Number*$ or $Time$ in media template with S list available
2170                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2171                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2172                                 representation_ms_info['fragments'] = []
2173                                 segment_time = 0
2174                                 segment_d = None
2175                                 segment_number = representation_ms_info['start_number']
2176
2177                                 def add_segment_url():
2178                                     segment_url = media_template % {
2179                                         'Time': segment_time,
2180                                         'Bandwidth': bandwidth,
2181                                         'Number': segment_number,
2182                                     }
2183                                     representation_ms_info['fragments'].append({
2184                                         media_location_key: segment_url,
2185                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2186                                     })
2187
2188                                 for num, s in enumerate(representation_ms_info['s']):
2189                                     segment_time = s.get('t') or segment_time
2190                                     segment_d = s['d']
2191                                     add_segment_url()
2192                                     segment_number += 1
2193                                     for r in range(s.get('r', 0)):
2194                                         segment_time += segment_d
2195                                         add_segment_url()
2196                                         segment_number += 1
2197                                     segment_time += segment_d
2198                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2199                             # No media template
2200                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2201                             # or any YouTube dashsegments video
2202                             fragments = []
2203                             segment_index = 0
2204                             timescale = representation_ms_info['timescale']
2205                             for s in representation_ms_info['s']:
2206                                 duration = float_or_none(s['d'], timescale)
2207                                 for r in range(s.get('r', 0) + 1):
2208                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2209                                     fragments.append({
2210                                         location_key(segment_uri): segment_uri,
2211                                         'duration': duration,
2212                                     })
2213                                     segment_index += 1
2214                             representation_ms_info['fragments'] = fragments
2215                         elif 'segment_urls' in representation_ms_info:
2216                             # Segment URLs with no SegmentTimeline
2217                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2218                             # https://github.com/rg3/youtube-dl/pull/14844
2219                             fragments = []
2220                             segment_duration = float_or_none(
2221                                 representation_ms_info['segment_duration'],
2222                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2223                             for segment_url in representation_ms_info['segment_urls']:
2224                                 fragment = {
2225                                     location_key(segment_url): segment_url,
2226                                 }
2227                                 if segment_duration:
2228                                     fragment['duration'] = segment_duration
2229                                 fragments.append(fragment)
2230                             representation_ms_info['fragments'] = fragments
2231                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2232                         # No fragments key is present in this case.
2233                         if 'fragments' in representation_ms_info:
2234                             f.update({
2235                                 'fragment_base_url': base_url,
2236                                 'fragments': [],
2237                                 'protocol': 'http_dash_segments',
2238                             })
2239                             if 'initialization_url' in representation_ms_info:
2240                                 initialization_url = representation_ms_info['initialization_url']
2241                                 if not f.get('url'):
2242                                     f['url'] = initialization_url
2243                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2244                             f['fragments'].extend(representation_ms_info['fragments'])
2245                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2246                         # is not necessarily unique within a Period thus formats with
2247                         # the same `format_id` are quite possible. There are numerous examples
2248                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2249                         # https://github.com/rg3/youtube-dl/issues/13919)
2250                         full_info = formats_dict.get(representation_id, {}).copy()
2251                         full_info.update(f)
2252                         formats.append(full_info)
2253                     else:
2254                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2255         return formats
2256
2257     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2258         res = self._download_xml_handle(
2259             ism_url, video_id,
2260             note=note or 'Downloading ISM manifest',
2261             errnote=errnote or 'Failed to download ISM manifest',
2262             fatal=fatal)
2263         if res is False:
2264             return []
2265         ism_doc, urlh = res
2266
2267         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2268
2269     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2270         """
2271         Parse formats from ISM manifest.
2272         References:
2273          1. [MS-SSTR]: Smooth Streaming Protocol,
2274             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2275         """
2276         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2277             return []
2278
2279         duration = int(ism_doc.attrib['Duration'])
2280         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2281
2282         formats = []
2283         for stream in ism_doc.findall('StreamIndex'):
2284             stream_type = stream.get('Type')
2285             if stream_type not in ('video', 'audio'):
2286                 continue
2287             url_pattern = stream.attrib['Url']
2288             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2289             stream_name = stream.get('Name')
2290             for track in stream.findall('QualityLevel'):
2291                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2292                 # TODO: add support for WVC1 and WMAP
2293                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2294                     self.report_warning('%s is not a supported codec' % fourcc)
2295                     continue
2296                 tbr = int(track.attrib['Bitrate']) // 1000
2297                 # [1] does not mention Width and Height attributes. However,
2298                 # they're often present while MaxWidth and MaxHeight are
2299                 # missing, so should be used as fallbacks
2300                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2301                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2302                 sampling_rate = int_or_none(track.get('SamplingRate'))
2303
2304                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2305                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2306
2307                 fragments = []
2308                 fragment_ctx = {
2309                     'time': 0,
2310                 }
2311                 stream_fragments = stream.findall('c')
2312                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2313                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2314                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2315                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2316                     if not fragment_ctx['duration']:
2317                         try:
2318                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2319                         except IndexError:
2320                             next_fragment_time = duration
2321                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2322                     for _ in range(fragment_repeat):
2323                         fragments.append({
2324                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2325                             'duration': fragment_ctx['duration'] / stream_timescale,
2326                         })
2327                         fragment_ctx['time'] += fragment_ctx['duration']
2328
2329                 format_id = []
2330                 if ism_id:
2331                     format_id.append(ism_id)
2332                 if stream_name:
2333                     format_id.append(stream_name)
2334                 format_id.append(compat_str(tbr))
2335
2336                 formats.append({
2337                     'format_id': '-'.join(format_id),
2338                     'url': ism_url,
2339                     'manifest_url': ism_url,
2340                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2341                     'width': width,
2342                     'height': height,
2343                     'tbr': tbr,
2344                     'asr': sampling_rate,
2345                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2346                     'acodec': 'none' if stream_type == 'video' else fourcc,
2347                     'protocol': 'ism',
2348                     'fragments': fragments,
2349                     '_download_params': {
2350                         'duration': duration,
2351                         'timescale': stream_timescale,
2352                         'width': width or 0,
2353                         'height': height or 0,
2354                         'fourcc': fourcc,
2355                         'codec_private_data': track.get('CodecPrivateData'),
2356                         'sampling_rate': sampling_rate,
2357                         'channels': int_or_none(track.get('Channels', 2)),
2358                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2359                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2360                     },
2361                 })
2362         return formats
2363
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage.

        Returns a list of info dicts, one per media tag that yielded any
        formats or subtitles; each carries 'formats', 'subtitles' and
        'thumbnail' keys. HLS (m3u8) and DASH (mpd) sources are expanded
        into their constituent formats.
        """
        def absolute_url(item_url):
            # Resolve item_url against the page URL. item_url may be None
            # (missing attribute); presumably the project urljoin helper
            # then returns a falsy value — note callers check src for
            # truthiness before relying on the result.
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive ext/codec format info from a MIME type attribute such
            # as 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"'.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Expand one source URL into (is_plain_url, formats): manifest
            # URLs (m3u8/mpd) yield multiple formats, anything else a single
            # direct-URL format.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # Self-closing tags first (they have no inner content, hence '')...
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        # ...then paired tags, capturing everything up to the closing tag
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # The tag's own src attribute, if any, contributes formats too
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # res attribute is not standard but seen several times
                        # in the wild
                        f.update({
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        })
                        # Merge after the attribute-derived fields so the
                        # extracted format's own keys take precedence
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks with no kind default to subtitles per HTML5
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Some servers gate media URLs on the Referer header
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2459
2460     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2461         formats = []
2462         hdcore_sign = 'hdcore=3.7.0'
2463         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2464         hds_host = hosts.get('hds')
2465         if hds_host:
2466             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2467         if 'hdcore=' not in f4m_url:
2468             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2469         f4m_formats = self._extract_f4m_formats(
2470             f4m_url, video_id, f4m_id='hds', fatal=False)
2471         for entry in f4m_formats:
2472             entry.update({'extra_param_to_segment_url': hdcore_sign})
2473         formats.extend(f4m_formats)
2474         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2475         hls_host = hosts.get('hls')
2476         if hls_host:
2477             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2478         formats.extend(self._extract_m3u8_formats(
2479             m3u8_url, video_id, 'mp4', 'm3u8_native',
2480             m3u8_id='hls', fatal=False))
2481         return formats
2482
2483     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2484         query = compat_urlparse.urlparse(url).query
2485         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2486         mobj = re.search(
2487             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2488         url_base = mobj.group('url')
2489         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2490         formats = []
2491
2492         def manifest_url(manifest):
2493             m_url = '%s/%s' % (http_base_url, manifest)
2494             if query:
2495                 m_url += '?%s' % query
2496             return m_url
2497
2498         if 'm3u8' not in skip_protocols:
2499             formats.extend(self._extract_m3u8_formats(
2500                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2501                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2502         if 'f4m' not in skip_protocols:
2503             formats.extend(self._extract_f4m_formats(
2504                 manifest_url('manifest.f4m'),
2505                 video_id, f4m_id='hds', fatal=False))
2506         if 'dash' not in skip_protocols:
2507             formats.extend(self._extract_mpd_formats(
2508                 manifest_url('manifest.mpd'),
2509                 video_id, mpd_id='dash', fatal=False))
2510         if re.search(r'(?:/smil:|\.smil)', url_base):
2511             if 'smil' not in skip_protocols:
2512                 rtmp_formats = self._extract_smil_formats(
2513                     manifest_url('jwplayer.smil'),
2514                     video_id, fatal=False)
2515                 for rtmp_format in rtmp_formats:
2516                     rtsp_format = rtmp_format.copy()
2517                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2518                     del rtsp_format['play_path']
2519                     del rtsp_format['ext']
2520                     rtsp_format.update({
2521                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2522                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2523                         'protocol': 'rtsp',
2524                     })
2525                     formats.extend([rtmp_format, rtsp_format])
2526         else:
2527             for protocol in ('rtmp', 'rtsp'):
2528                 if protocol not in skip_protocols:
2529                     formats.append({
2530                         'url': '%s:%s' % (protocol, url_base),
2531                         'format_id': protocol,
2532                         'protocol': protocol,
2533                     })
2534         return formats
2535
2536     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2537         mobj = re.search(
2538             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2539             webpage)
2540         if mobj:
2541             try:
2542                 jwplayer_data = self._parse_json(mobj.group('options'),
2543                                                  video_id=video_id,
2544                                                  transform_source=transform_source)
2545             except ExtractorError:
2546                 pass
2547             else:
2548                 if isinstance(jwplayer_data, dict):
2549                     return jwplayer_data
2550
2551     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2552         jwplayer_data = self._find_jwplayer_data(
2553             webpage, video_id, transform_source=js_to_json)
2554         return self._parse_jwplayer_data(
2555             jwplayer_data, video_id, *args, **kwargs)
2556
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a jwplayer setup/config dict into a single info dict or, for
        multi-item playlists, a playlist result.

        Normalizes several legacy JWPlayer config layouts in place before
        parsing; with require_title a missing 'title' raises KeyError.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks, keyed by label ('en' by default).
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A lone YouTube source is delegated to the YouTube extractor.
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2624
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer 'sources' list into a list of format dicts.

        HLS/DASH/SMIL sources are expanded through the corresponding manifest
        extractors; duplicate source URLs are skipped.
        """
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            # Skip URLs already handled to avoid duplicate formats.
            if source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            # Prefer the declared MIME type, fall back to the URL extension.
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split the stream URL into app URL and play path at the
                    # container prefix (mp4:/mp3:/flv:).
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2691
2692     def _live_title(self, name):
2693         """ Generate the title for a live video """
2694         now = datetime.datetime.now()
2695         now_str = now.strftime('%Y-%m-%d %H:%M')
2696         return name + ' ' + now_str
2697
2698     def _int(self, v, name, fatal=False, **kwargs):
2699         res = int_or_none(v, **kwargs)
2700         if 'get_attr' in kwargs:
2701             print(getattr(v, kwargs['get_attr']))
2702         if res is None:
2703             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2704             if fatal:
2705                 raise ExtractorError(msg)
2706             else:
2707                 self._downloader.report_warning(msg)
2708         return res
2709
2710     def _float(self, v, name, fatal=False, **kwargs):
2711         res = float_or_none(v, **kwargs)
2712         if res is None:
2713             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2714             if fatal:
2715                 raise ExtractorError(msg)
2716             else:
2717                 self._downloader.report_warning(msg)
2718         return res
2719
    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        # Install a cookie into the downloader's cookiejar. The stdlib Cookie
        # constructor takes positional fields in this exact order: version,
        # name, value, port, port_specified, domain, domain_specified,
        # domain_initial_dot, path, path_specified, secure, expires, discard,
        # comment, comment_url, rest — do not reorder the arguments below.
        cookie = compat_cookiejar.Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self._downloader.cookiejar.set_cookie(cookie)
2727
2728     def _get_cookies(self, url):
2729         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2730         req = sanitized_Request(url)
2731         self._downloader.cookiejar.add_cookie_header(req)
2732         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2733
2734     def get_testcases(self, include_onlymatching=False):
2735         t = getattr(self, '_TEST', None)
2736         if t:
2737             assert not hasattr(self, '_TESTS'), \
2738                 '%s has _TEST and _TESTS' % type(self).__name__
2739             tests = [t]
2740         else:
2741             tests = getattr(self, '_TESTS', [])
2742         for t in tests:
2743             if not include_onlymatching and t.get('only_matching', False):
2744                 continue
2745             t['name'] = type(self).__name__[:-len('IE')]
2746             yield t
2747
2748     def is_suitable(self, age_limit):
2749         """ Test whether the extractor is generally suitable for the given
2750         age limit (i.e. pornographic sites are not, all others usually are) """
2751
2752         any_restricted = False
2753         for tc in self.get_testcases(include_onlymatching=False):
2754             if tc.get('playlist', []):
2755                 tc = tc['playlist'][0]
2756             is_restricted = age_restricted(
2757                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2758             if not is_restricted:
2759                 return True
2760             any_restricted = any_restricted or is_restricted
2761         return not any_restricted
2762
2763     def extract_subtitles(self, *args, **kwargs):
2764         if (self._downloader.params.get('writesubtitles', False) or
2765                 self._downloader.params.get('listsubtitles')):
2766             return self._get_subtitles(*args, **kwargs)
2767         return {}
2768
    def _get_subtitles(self, *args, **kwargs):
        # Subclass hook invoked by extract_subtitles(); expected to return a
        # dict mapping language to subtitle items (cf. _merge_subtitles).
        raise NotImplementedError('This method must be implemented by subclasses')
2771
2772     @staticmethod
2773     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2774         """ Merge subtitle items for one language. Items with duplicated URLs
2775         will be dropped. """
2776         list1_urls = set([item['url'] for item in subtitle_list1])
2777         ret = list(subtitle_list1)
2778         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2779         return ret
2780
2781     @classmethod
2782     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2783         """ Merge two subtitle dictionaries, language by language. """
2784         ret = dict(subtitle_dict1)
2785         for lang in subtitle_dict2:
2786             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2787         return ret
2788
2789     def extract_automatic_captions(self, *args, **kwargs):
2790         if (self._downloader.params.get('writeautomaticsub', False) or
2791                 self._downloader.params.get('listsubtitles')):
2792             return self._get_automatic_captions(*args, **kwargs)
2793         return {}
2794
    def _get_automatic_captions(self, *args, **kwargs):
        # Subclass hook invoked by extract_automatic_captions().
        raise NotImplementedError('This method must be implemented by subclasses')
2797
2798     def mark_watched(self, *args, **kwargs):
2799         if (self._downloader.params.get('mark_watched', False) and
2800                 (self._get_login_info()[0] is not None or
2801                     self._downloader.params.get('cookiefile') is not None)):
2802             self._mark_watched(*args, **kwargs)
2803
    def _mark_watched(self, *args, **kwargs):
        # Subclass hook invoked by mark_watched() when watch tracking is enabled.
        raise NotImplementedError('This method must be implemented by subclasses')
2806
2807     def geo_verification_headers(self):
2808         headers = {}
2809         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2810         if geo_verification_proxy:
2811             headers['Ytdl-request-proxy'] = geo_verification_proxy
2812         return headers
2813
2814     def _generic_id(self, url):
2815         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2816
2817     def _generic_title(self, url):
2818         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2819
2820
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (first result only), a positive number, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')

        # Dispatch on the requested result count.
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY