[utils] Share JSON-LD regex
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_integer_types,
23     compat_http_client,
24     compat_os_name,
25     compat_str,
26     compat_urllib_error,
27     compat_urllib_parse_unquote,
28     compat_urllib_parse_urlencode,
29     compat_urllib_request,
30     compat_urlparse,
31     compat_xml_parse_error,
32 )
33 from ..downloader.f4m import (
34     get_base_url,
35     remove_encrypted_media,
36 )
37 from ..utils import (
38     NO_DEFAULT,
39     age_restricted,
40     base_url,
41     bug_reports_message,
42     clean_html,
43     compiled_regex_type,
44     determine_ext,
45     determine_protocol,
46     error_to_compat_str,
47     ExtractorError,
48     extract_attributes,
49     fix_xml_ampersands,
50     float_or_none,
51     GeoRestrictedError,
52     GeoUtils,
53     int_or_none,
54     js_to_json,
55     JSON_LD_RE,
56     mimetype2ext,
57     orderedSet,
58     parse_codecs,
59     parse_duration,
60     parse_iso8601,
61     parse_m3u8_attributes,
62     RegexNotFoundError,
63     sanitized_Request,
64     sanitize_filename,
65     unescapeHTML,
66     unified_strdate,
67     unified_timestamp,
68     update_Request,
69     update_url_query,
70     urljoin,
71     url_basename,
72     xpath_element,
73     xpath_text,
74     xpath_with_ns,
75 )
76
77
78 class InfoExtractor(object):
79     """Information Extractor class.
80
81     Information extractors are the classes that, given a URL, extract
82     information about the video (or videos) the URL refers to. This
83     information includes the real video URL, the video title, author and
84     others. The information is stored in a dictionary which is then
85     passed to the YoutubeDL. The YoutubeDL processes this
86     information possibly downloading the video to the file system, among
87     other possible outcomes.
88
89     The type field determines the type of the result.
90     By far the most common value (and the default if _type is missing) is
91     "video", which indicates a single video.
92
93     For a video, the dictionaries must include the following fields:
94
95     id:             Video identifier.
96     title:          Video title, unescaped.
97
98     Additionally, it must contain either a formats entry or a url one:
99
100     formats:        A list of dictionaries for each format available, ordered
101                     from worst to best quality.
102
103                     Potential fields:
104                     * url        Mandatory. The URL of the video file
105                     * manifest_url
106                                  The URL of the manifest file in case of
107                                  fragmented media (DASH, hls, hds)
108                     * ext        Will be calculated from URL if missing
109                     * format     A human-readable description of the format
110                                  ("mp4 container with h264/opus").
111                                  Calculated from the format_id, width, height.
112                                  and format_note fields if missing.
113                     * format_id  A short description of the format
114                                  ("mp4_h264_opus" or "19").
115                                 Technically optional, but strongly recommended.
116                     * format_note Additional info about the format
117                                  ("3D" or "DASH video")
118                     * width      Width of the video, if known
119                     * height     Height of the video, if known
120                     * resolution Textual description of width and height
121                     * tbr        Average bitrate of audio and video in KBit/s
122                     * abr        Average audio bitrate in KBit/s
123                     * acodec     Name of the audio codec in use
124                     * asr        Audio sampling rate in Hertz
125                     * vbr        Average video bitrate in KBit/s
126                     * fps        Frame rate
127                     * vcodec     Name of the video codec in use
128                     * container  Name of the container format
129                     * filesize   The number of bytes, if known in advance
130                     * filesize_approx  An estimate for the number of bytes
131                     * player_url SWF Player URL (used for rtmpdump).
132                     * protocol   The protocol that will be used for the actual
133                                  download, lower-case.
134                                  "http", "https", "rtsp", "rtmp", "rtmpe",
135                                  "m3u8", "m3u8_native" or "http_dash_segments".
136                     * fragment_base_url
137                                  Base URL for fragments. Each fragment's path
138                                  value (if present) will be relative to
139                                  this URL.
140                     * fragments  A list of fragments of a fragmented media.
141                                  Each fragment entry must contain either an url
142                                  or a path. If an url is present it should be
143                                  considered by a client. Otherwise both path and
144                                  fragment_base_url must be present. Here is
145                                  the list of all potential fields:
146                                  * "url" - fragment's URL
147                                  * "path" - fragment's path relative to
148                                             fragment_base_url
149                                  * "duration" (optional, int or float)
150                                  * "filesize" (optional, int)
151                     * preference Order number of this format. If this field is
152                                  present and not None, the formats get sorted
153                                  by this field, regardless of all other values.
154                                  -1 for default (order by other properties),
155                                  -2 or smaller for less than default.
156                                  < -1000 to hide the format (if there is
157                                     another one which is strictly better)
158                     * language   Language code, e.g. "de" or "en-US".
159                     * language_preference  Is this in the language mentioned in
160                                  the URL?
161                                  10 if it's what the URL is about,
162                                  -1 for default (don't know),
163                                  -10 otherwise, other values reserved for now.
164                     * quality    Order number of the video quality of this
165                                  format, irrespective of the file format.
166                                  -1 for default (order by other properties),
167                                  -2 or smaller for less than default.
168                     * source_preference  Order number for this video source
169                                   (quality takes higher priority)
170                                  -1 for default (order by other properties),
171                                  -2 or smaller for less than default.
172                     * http_headers  A dictionary of additional HTTP headers
173                                  to add to the request.
174                     * stretched_ratio  If given and not 1, indicates that the
175                                  video's pixels are not square.
176                                  width : height ratio as float.
177                     * no_resume  The server does not support resuming the
178                                  (HTTP or RTMP) download. Boolean.
179                     * downloader_options  A dictionary of downloader options as
180                                  described in FileDownloader
181
182     url:            Final video URL.
183     ext:            Video filename extension.
184     format:         The video format, defaults to ext (used for --get-format)
185     player_url:     SWF Player URL (used for rtmpdump).
186
187     The following fields are optional:
188
189     alt_title:      A secondary title of the video.
190     display_id      An alternative identifier for the video, not necessarily
191                     unique, but available before title. Typically, id is
192                     something like "4234987", title "Dancing naked mole rats",
193                     and display_id "dancing-naked-mole-rats"
194     thumbnails:     A list of dictionaries, with the following entries:
195                         * "id" (optional, string) - Thumbnail format ID
196                         * "url"
197                         * "preference" (optional, int) - quality of the image
198                         * "width" (optional, int)
199                         * "height" (optional, int)
200                         * "resolution" (optional, string "{width}x{height}",
201                                         deprecated)
202                         * "filesize" (optional, int)
203     thumbnail:      Full URL to a video thumbnail image.
204     description:    Full video description.
205     uploader:       Full name of the video uploader.
206     license:        License name the video is licensed under.
207     creator:        The creator of the video.
208     release_date:   The date (YYYYMMDD) when the video was released.
209     timestamp:      UNIX timestamp of the moment the video became available.
210     upload_date:    Video upload date (YYYYMMDD).
211                     If not explicitly set, calculated from timestamp.
212     uploader_id:    Nickname or id of the video uploader.
213     uploader_url:   Full URL to a personal webpage of the video uploader.
214     location:       Physical location where the video was filmed.
215     subtitles:      The available subtitles as a dictionary in the format
216                     {tag: subformats}. "tag" is usually a language code, and
217                     "subformats" is a list sorted from lower to higher
218                     preference, each element is a dictionary with the "ext"
219                     entry and one of:
220                         * "data": The subtitles file contents
221                         * "url": A URL pointing to the subtitles file
222                     "ext" will be calculated from URL if missing
223     automatic_captions: Like 'subtitles', used by the YoutubeIE for
224                     automatically generated captions
225     duration:       Length of the video in seconds, as an integer or float.
226     view_count:     How many users have watched the video on the platform.
227     like_count:     Number of positive ratings of the video
228     dislike_count:  Number of negative ratings of the video
229     repost_count:   Number of reposts of the video
230     average_rating: Average rating given by users, the scale used depends on the webpage
231     comment_count:  Number of comments on the video
232     comments:       A list of comments, each with one or more of the following
233                     properties (all but one of text or html optional):
234                         * "author" - human-readable name of the comment author
235                         * "author_id" - user ID of the comment author
236                         * "id" - Comment ID
237                         * "html" - Comment as HTML
238                         * "text" - Plain text of the comment
239                         * "timestamp" - UNIX timestamp of comment
240                         * "parent" - ID of the comment this one is replying to.
241                                      Set to "root" to indicate that this is a
242                                      comment to the original video.
243     age_limit:      Age restriction for the video, as an integer (years)
244     webpage_url:    The URL to the video webpage, if given to youtube-dl it
245                     should allow to get the same result again. (It will be set
246                     by YoutubeDL if it's missing)
247     categories:     A list of categories that the video falls in, for example
248                     ["Sports", "Berlin"]
249     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
250     is_live:        True, False, or None (=unknown). Whether this video is a
251                     live stream that goes on instead of a fixed-length video.
252     start_time:     Time in seconds where the reproduction should start, as
253                     specified in the URL.
254     end_time:       Time in seconds where the reproduction should end, as
255                     specified in the URL.
256     chapters:       A list of dictionaries, with the following entries:
257                         * "start_time" - The start time of the chapter in seconds
258                         * "end_time" - The end time of the chapter in seconds
259                         * "title" (optional, string)
260
261     The following fields should only be used when the video belongs to some logical
262     chapter or section:
263
264     chapter:        Name or title of the chapter the video belongs to.
265     chapter_number: Number of the chapter the video belongs to, as an integer.
266     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
267
268     The following fields should only be used when the video is an episode of some
269     series, programme or podcast:
270
271     series:         Title of the series or programme the video episode belongs to.
272     season:         Title of the season the video episode belongs to.
273     season_number:  Number of the season the video episode belongs to, as an integer.
274     season_id:      Id of the season the video episode belongs to, as a unicode string.
275     episode:        Title of the video episode. Unlike mandatory video title field,
276                     this field should denote the exact title of the video episode
277                     without any kind of decoration.
278     episode_number: Number of the video episode within a season, as an integer.
279     episode_id:     Id of the video episode, as a unicode string.
280
281     The following fields should only be used when the media is a track or a part of
282     a music album:
283
284     track:          Title of the track.
285     track_number:   Number of the track within an album or a disc, as an integer.
286     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
287                     as a unicode string.
288     artist:         Artist(s) of the track.
289     genre:          Genre(s) of the track.
290     album:          Title of the album the track belongs to.
291     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
292     album_artist:   List of all artists appeared on the album (e.g.
293                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
294                     and compilations).
295     disc_number:    Number of the disc or other physical medium the track belongs to,
296                     as an integer.
297     release_year:   Year (YYYY) when the album was released.
298
299     Unless mentioned otherwise, the fields should be Unicode strings.
300
301     Unless mentioned otherwise, None is equivalent to absence of information.
302
303
304     _type "playlist" indicates multiple videos.
305     There must be a key "entries", which is a list, an iterable, or a PagedList
306     object, each element of which is a valid dictionary by this specification.
307
308     Additionally, playlists can have "id", "title", "description", "uploader",
309     "uploader_id", "uploader_url" attributes with the same semantics as videos
310     (see above).
311
312
313     _type "multi_video" indicates that there are multiple videos that
314     form a single show, for example, multiple acts of an opera or TV episode.
315     It must have an entries key like a playlist and contain all the keys
316     required for a video at the same time.
317
318
319     _type "url" indicates that the video must be extracted from another
320     location, possibly by a different extractor. Its only required key is:
321     "url" - the next URL to extract.
322     The key "ie_key" can be set to the class name (minus the trailing "IE",
323     e.g. "Youtube") if the extractor class is known in advance.
324     Additionally, the dictionary may have any properties of the resolved entity
325     known in advance, for example "title" if the title of the referred video is
326     known ahead of time.
327
328
329     _type "url_transparent" entities have the same specification as "url", but
330     indicate that the given additional information is more precise than the one
331     associated with the resolved URL.
332     This is useful when a site employs a video service that hosts the video and
333     its technical metadata, but that video service does not embed a useful
334     title, description etc.
335
336
337     Subclasses of this one should re-define the _real_initialize() and
338     _real_extract() methods and define a _VALID_URL regexp.
339     Probably, they should also be added to the list of extractors.
340
341     _GEO_BYPASS attribute may be set to False in order to disable
342     geo restriction bypass mechanisms for a particular extractor.
343     Though it won't disable explicit geo restriction bypass based on
344     country code provided with geo_bypass_country.
345
346     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
347     countries for this extractor. One of these countries will be used by
348     geo restriction bypass mechanism right away in order to bypass
349     geo restriction, of course, if the mechanism is not disabled.
350
351     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
352     IP blocks in CIDR notation for this extractor. One of these IP blocks
353     will be used by geo restriction bypass mechanism similarly
354     to _GEO_COUNTRIES.
355
356     Finally, the _WORKING attribute should be set to False for broken IEs
357     in order to warn the users and skip the tests.
358     """
359
    # Default extractor state and capability flags; the class docstring above
    # describes the _GEO_* and _WORKING attributes in detail.
    _ready = False              # becomes True once initialize() has completed
    _downloader = None          # set via set_downloader()
    _x_forwarded_for_ip = None  # fake client IP used for geo bypass, if any
    _GEO_BYPASS = True          # False disables geo bypass for this extractor
    _GEO_COUNTRIES = None       # presumably geo unrestricted country codes
    _GEO_IP_BLOCKS = None       # presumably geo unrestricted IP blocks (CIDR)
    _WORKING = True             # False marks the extractor as broken
367
368     def __init__(self, downloader=None):
369         """Constructor. Receives an optional downloader."""
370         self._ready = False
371         self._x_forwarded_for_ip = None
372         self.set_downloader(downloader)
373
374     @classmethod
375     def suitable(cls, url):
376         """Receives a URL and returns True if suitable for this IE."""
377
378         # This does not use has/getattr intentionally - we want to know whether
379         # we have cached the regexp for *this* class, whereas getattr would also
380         # match the superclass
381         if '_VALID_URL_RE' not in cls.__dict__:
382             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
383         return cls._VALID_URL_RE.match(url) is not None
384
385     @classmethod
386     def _match_id(cls, url):
387         if '_VALID_URL_RE' not in cls.__dict__:
388             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
389         m = cls._VALID_URL_RE.match(url)
390         assert m
391         return compat_str(m.group('id'))
392
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        # True unless the extractor was marked broken via _WORKING = False.
        return cls._WORKING
397
398     def initialize(self):
399         """Initializes an instance (authentication, etc)."""
400         self._initialize_geo_bypass({
401             'countries': self._GEO_COUNTRIES,
402             'ip_blocks': self._GEO_IP_BLOCKS,
403         })
404         if not self._ready:
405             self._real_initialize()
406             self._ready = True
407
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        # Only ever install a fake IP once per instance; later calls are
        # no-ops while _x_forwarded_for_ip is set.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                # NOTE(review): GeoUtils.random_ipv4 is given a CIDR block
                # here but a country code below — confirm both are supported
                # in utils.
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))
495
496     def extract(self, url):
497         """Extracts URL information and returns it in list of dicts."""
498         try:
499             for _ in range(2):
500                 try:
501                     self.initialize()
502                     ie_result = self._real_extract(url)
503                     if self._x_forwarded_for_ip:
504                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
505                     return ie_result
506                 except GeoRestrictedError as e:
507                     if self.__maybe_fake_ip_and_retry(e.countries):
508                         continue
509                     raise
510         except ExtractorError:
511             raise
512         except compat_http_client.IncompleteRead as e:
513             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
514         except (KeyError, StopIteration) as e:
515             raise ExtractorError('An extractor error has occurred.', cause=e)
516
517     def __maybe_fake_ip_and_retry(self, countries):
518         if (not self._downloader.params.get('geo_bypass_country', None) and
519                 self._GEO_BYPASS and
520                 self._downloader.params.get('geo_bypass', True) and
521                 not self._x_forwarded_for_ip and
522                 countries):
523             country_code = random.choice(countries)
524             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
525             if self._x_forwarded_for_ip:
526                 self.report_warning(
527                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
528                     % (self._x_forwarded_for_ip, country_code.upper()))
529                 return True
530         return False
531
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        downloader: the YoutubeDL instance (or None) this extractor uses for
        output and network access.
        """
        self._downloader = downloader
535
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Deliberate no-op in the base class; invoked once per instance by
        # initialize().
        pass
539
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Deliberate no-op in the base class; invoked by extract(), which
        # handles geo bypass retries and error wrapping.
        pass
543
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Extractor classes are conventionally named <Name>IE; drop the
        # trailing "IE" to obtain the key.
        return compat_str(cls.__name__[:-2])
548
    @property
    def IE_NAME(self):
        """Human-readable extractor name: the class name minus "IE"."""
        return compat_str(type(self).__name__[:-2])
552
553     @staticmethod
554     def __can_accept_status_code(err, expected_status):
555         assert isinstance(err, compat_urllib_error.HTTPError)
556         if expected_status is None:
557             return False
558         if isinstance(expected_status, compat_integer_types):
559             return err.code == expected_status
560         elif isinstance(expected_status, (list, tuple)):
561             return err.code in expected_status
562         elif callable(expected_status):
563             return expected_status(err.code) is True
564         else:
565             assert False
566
567     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
568         """
569         Return the response handle.
570
571         See _download_webpage docstring for arguments specification.
572         """
573         if note is None:
574             self.report_download_webpage(video_id)
575         elif note is not False:
576             if video_id is None:
577                 self.to_screen('%s' % (note,))
578             else:
579                 self.to_screen('%s: %s' % (video_id, note))
580
581         # Some sites check X-Forwarded-For HTTP header in order to figure out
582         # the origin of the client behind proxy. This allows bypassing geo
583         # restriction by faking this header's value to IP that belongs to some
584         # geo unrestricted country. We will do so once we encounter any
585         # geo restriction error.
586         if self._x_forwarded_for_ip:
587             if 'X-Forwarded-For' not in headers:
588                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
589
590         if isinstance(url_or_request, compat_urllib_request.Request):
591             url_or_request = update_Request(
592                 url_or_request, data=data, headers=headers, query=query)
593         else:
594             if query:
595                 url_or_request = update_url_query(url_or_request, query)
596             if data is not None or headers:
597                 url_or_request = sanitized_Request(url_or_request, data, headers)
598         try:
599             return self._downloader.urlopen(url_or_request)
600         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
601             if isinstance(err, compat_urllib_error.HTTPError):
602                 if self.__can_accept_status_code(err, expected_status):
603                     return err.fp
604
605             if errnote is False:
606                 return False
607             if errnote is None:
608                 errnote = 'Unable to download webpage'
609
610             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
611             if fatal:
612                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
613             else:
614                 self._downloader.report_warning(errmsg)
615                 return False
616
617     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
618         """
619         Return a tuple (page content as string, URL handle).
620
621         See _download_webpage docstring for arguments specification.
622         """
623         # Strip hashes from the URL (#1038)
624         if isinstance(url_or_request, (compat_str, str)):
625             url_or_request = url_or_request.partition('#')[0]
626
627         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
628         if urlh is False:
629             assert not fatal
630             return False
631         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
632         return (content, urlh)
633
634     @staticmethod
635     def _guess_encoding_from_content(content_type, webpage_bytes):
636         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
637         if m:
638             encoding = m.group(1)
639         else:
640             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
641                           webpage_bytes[:1024])
642             if m:
643                 encoding = m.group(1).decode('ascii')
644             elif webpage_bytes.startswith(b'\xff\xfe'):
645                 encoding = 'utf-16'
646             else:
647                 encoding = 'utf-8'
648
649         return encoding
650
    def __check_blocked(self, content):
        # Detect known censorship/filtering interstitial pages and raise an
        # expected ExtractorError with a helpful message instead of letting
        # extraction fail cryptically further down the line.
        first_block = content[:512]
        # Websense corporate filtering software
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government censorship block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                # Flatten multi-line block messages into a single line
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian TTK provider block page referencing the federal blocklist
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
678
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read and decode the response body of urlh.

        Honors the dump_intermediate_pages and write_pages options,
        guesses the text encoding when none is given and checks the
        decoded content for known censorship block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            # Caller-supplied bytes to prepend to the body
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps arbitrary bytes printable on any console
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the file name within filesystem limits while keeping
                # it unique via a hash of the full original name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to UTF-8 rather than failing
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
715
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                # Truncated responses are retried up to `tries` times,
                # sleeping `timeout` seconds between attempts
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            # Non-fatal failure inside _download_webpage_handle
            return res
        else:
            content, _ = res
            return content
773
774     def _download_xml_handle(
775             self, url_or_request, video_id, note='Downloading XML',
776             errnote='Unable to download XML', transform_source=None,
777             fatal=True, encoding=None, data=None, headers={}, query={},
778             expected_status=None):
779         """
780         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
781
782         See _download_webpage docstring for arguments specification.
783         """
784         res = self._download_webpage_handle(
785             url_or_request, video_id, note, errnote, fatal=fatal,
786             encoding=encoding, data=data, headers=headers, query=query,
787             expected_status=expected_status)
788         if res is False:
789             return res
790         xml_string, urlh = res
791         return self._parse_xml(
792             xml_string, video_id, transform_source=transform_source,
793             fatal=fatal), urlh
794
795     def _download_xml(
796             self, url_or_request, video_id,
797             note='Downloading XML', errnote='Unable to download XML',
798             transform_source=None, fatal=True, encoding=None,
799             data=None, headers={}, query={}, expected_status=None):
800         """
801         Return the xml as an xml.etree.ElementTree.Element.
802
803         See _download_webpage docstring for arguments specification.
804         """
805         res = self._download_xml_handle(
806             url_or_request, video_id, note=note, errnote=errnote,
807             transform_source=transform_source, fatal=fatal, encoding=encoding,
808             data=data, headers=headers, query=query,
809             expected_status=expected_status)
810         return res if res is False else res[0]
811
812     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
813         if transform_source:
814             xml_string = transform_source(xml_string)
815         try:
816             return compat_etree_fromstring(xml_string.encode('utf-8'))
817         except compat_xml_parse_error as ve:
818             errmsg = '%s: Failed to parse XML ' % video_id
819             if fatal:
820                 raise ExtractorError(errmsg, cause=ve)
821             else:
822                 self.report_warning(errmsg + str(ve))
823
824     def _download_json_handle(
825             self, url_or_request, video_id, note='Downloading JSON metadata',
826             errnote='Unable to download JSON metadata', transform_source=None,
827             fatal=True, encoding=None, data=None, headers={}, query={},
828             expected_status=None):
829         """
830         Return a tuple (JSON object, URL handle).
831
832         See _download_webpage docstring for arguments specification.
833         """
834         res = self._download_webpage_handle(
835             url_or_request, video_id, note, errnote, fatal=fatal,
836             encoding=encoding, data=data, headers=headers, query=query,
837             expected_status=expected_status)
838         if res is False:
839             return res
840         json_string, urlh = res
841         return self._parse_json(
842             json_string, video_id, transform_source=transform_source,
843             fatal=fatal), urlh
844
845     def _download_json(
846             self, url_or_request, video_id, note='Downloading JSON metadata',
847             errnote='Unable to download JSON metadata', transform_source=None,
848             fatal=True, encoding=None, data=None, headers={}, query={},
849             expected_status=None):
850         """
851         Return the JSON object as a dict.
852
853         See _download_webpage docstring for arguments specification.
854         """
855         res = self._download_json_handle(
856             url_or_request, video_id, note=note, errnote=errnote,
857             transform_source=transform_source, fatal=fatal, encoding=encoding,
858             data=data, headers=headers, query=query,
859             expected_status=expected_status)
860         return res if res is False else res[0]
861
862     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
863         if transform_source:
864             json_string = transform_source(json_string)
865         try:
866             return json.loads(json_string)
867         except ValueError as ve:
868             errmsg = '%s: Failed to parse JSON ' % video_id
869             if fatal:
870                 raise ExtractorError(errmsg, cause=ve)
871             else:
872                 self.report_warning(errmsg + str(ve))
873
874     def report_warning(self, msg, video_id=None):
875         idstr = '' if video_id is None else '%s: ' % video_id
876         self._downloader.report_warning(
877             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
878
879     def to_screen(self, msg):
880         """Print msg to screen, prefixing it with '[ie_name]'"""
881         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
882
883     def report_extraction(self, id_or_name):
884         """Report information extraction."""
885         self.to_screen('%s: Extracting information' % id_or_name)
886
887     def report_download_webpage(self, video_id):
888         """Report webpage download."""
889         self.to_screen('%s: Downloading webpage' % video_id)
890
891     def report_age_confirmation(self):
892         """Report attempt to confirm age."""
893         self.to_screen('Confirming age')
894
895     def report_login(self):
896         """Report attempt to log in."""
897         self.to_screen('Logging in')
898
899     @staticmethod
900     def raise_login_required(msg='This video is only available for registered users'):
901         raise ExtractorError(
902             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
903             expected=True)
904
905     @staticmethod
906     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
907         raise GeoRestrictedError(msg, countries=countries)
908
909     # Methods for following #608
910     @staticmethod
911     def url_result(url, ie=None, video_id=None, video_title=None):
912         """Returns a URL that points to a page that should be processed"""
913         # TODO: ie should be the class used for getting the info
914         video_info = {'_type': 'url',
915                       'url': url,
916                       'ie_key': ie}
917         if video_id is not None:
918             video_info['id'] = video_id
919         if video_title is not None:
920             video_info['title'] = video_title
921         return video_info
922
923     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
924         urls = orderedSet(
925             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
926             for m in matches)
927         return self.playlist_result(
928             urls, playlist_id=playlist_id, playlist_title=playlist_title)
929
930     @staticmethod
931     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
932         """Returns a playlist"""
933         video_info = {'_type': 'playlist',
934                       'entries': entries}
935         if playlist_id:
936             video_info['id'] = playlist_id
937         if playlist_title:
938             video_info['title'] = playlist_title
939         if playlist_description:
940             video_info['description'] = playlist_description
941         return video_info
942
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list/tuple of patterns: the first one that matches wins.
            # NOTE(review): assumes the iterable is non-empty, otherwise
            # `mobj` below would be unbound — confirm callers never pass []
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue when writing to a color-capable
        # terminal (never on Windows consoles)
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
976
977     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
978         """
979         Like _search_regex, but strips HTML tags and unescapes entities.
980         """
981         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
982         if res:
983             return clean_html(res).strip()
984         else:
985             return res
986
    def _get_netrc_login_info(self, netrc_machine=None):
        """
        Look up (username, password) for netrc_machine (defaults to
        self._NETRC_MACHINE) in the user's .netrc file.

        Only consulted when the usenetrc option is set; returns
        (None, None) otherwise or when lookup/parsing fails (a warning
        is reported in the failure case).
        """
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                # Do not abort extraction over a broken .netrc; just warn
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password
1006
1007     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1008         """
1009         Get the login info as (username, password)
1010         First look for the manually specified credentials using username_option
1011         and password_option as keys in params dictionary. If no such credentials
1012         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1013         value.
1014         If there's no info available, return (None, None)
1015         """
1016         if self._downloader is None:
1017             return (None, None)
1018
1019         downloader_params = self._downloader.params
1020
1021         # Attempt to use provided username and password or .netrc data
1022         if downloader_params.get(username_option) is not None:
1023             username = downloader_params[username_option]
1024             password = downloader_params[password_option]
1025         else:
1026             username, password = self._get_netrc_login_info(netrc_machine)
1027
1028         return username, password
1029
1030     def _get_tfa_info(self, note='two-factor verification code'):
1031         """
1032         Get the two-factor authentication info
1033         TODO - asking the user will be required for sms/phone verify
1034         currently just uses the command line option
1035         If there's no info available, return None
1036         """
1037         if self._downloader is None:
1038             return None
1039         downloader_params = self._downloader.params
1040
1041         if downloader_params.get('twofactor') is not None:
1042             return downloader_params['twofactor']
1043
1044         return compat_getpass('Type %s and press [Return]: ' % note)
1045
1046     # Helper functions for extracting OpenGraph info
1047     @staticmethod
1048     def _og_regexes(prop):
1049         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1050         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
1051                        % {'prop': re.escape(prop)})
1052         template = r'<meta[^>]+?%s[^>]+?%s'
1053         return [
1054             template % (property_re, content_re),
1055             template % (content_re, property_re),
1056         ]
1057
1058     @staticmethod
1059     def _meta_regex(prop):
1060         return r'''(?isx)<meta
1061                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1062                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1063
1064     def _og_search_property(self, prop, html, name=None, **kargs):
1065         if not isinstance(prop, (list, tuple)):
1066             prop = [prop]
1067         if name is None:
1068             name = 'OpenGraph %s' % prop[0]
1069         og_regexes = []
1070         for p in prop:
1071             og_regexes.extend(self._og_regexes(p))
1072         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1073         if escaped is None:
1074             return None
1075         return unescapeHTML(escaped)
1076
1077     def _og_search_thumbnail(self, html, **kargs):
1078         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1079
1080     def _og_search_description(self, html, **kargs):
1081         return self._og_search_property('description', html, fatal=False, **kargs)
1082
1083     def _og_search_title(self, html, **kargs):
1084         return self._og_search_property('title', html, **kargs)
1085
1086     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1087         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1088         if secure:
1089             regexes = self._og_regexes('video:secure_url') + regexes
1090         return self._html_search_regex(regexes, html, name, **kargs)
1091
1092     def _og_search_url(self, html, **kargs):
1093         return self._og_search_property('url', html, **kargs)
1094
1095     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1096         if not isinstance(name, (list, tuple)):
1097             name = [name]
1098         if display_name is None:
1099             display_name = name[0]
1100         return self._html_search_regex(
1101             [self._meta_regex(n) for n in name],
1102             html, display_name, fatal=fatal, group='content', **kwargs)
1103
1104     def _dc_search_uploader(self, html):
1105         return self._html_search_meta('dc.creator', html, 'uploader')
1106
1107     def _rta_search(self, html):
1108         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1109         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1110                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1111                      html):
1112             return 18
1113         return 0
1114
1115     def _media_rating_search(self, html):
1116         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1117         rating = self._html_search_meta('rating', html)
1118
1119         if not rating:
1120             return None
1121
1122         RATING_TABLE = {
1123             'safe for kids': 0,
1124             'general': 8,
1125             '14 years': 14,
1126             'mature': 17,
1127             'restricted': 19,
1128         }
1129         return RATING_TABLE.get(rating.lower())
1130
1131     def _family_friendly_search(self, html):
1132         # See http://schema.org/VideoObject
1133         family_friendly = self._html_search_meta(
1134             'isFamilyFriendly', html, default=None)
1135
1136         if not family_friendly:
1137             return None
1138
1139         RATING_TABLE = {
1140             '1': 0,
1141             'true': 0,
1142             '0': 18,
1143             'false': 18,
1144         }
1145         return RATING_TABLE.get(family_friendly.lower())
1146
1147     def _twitter_search_player(self, html):
1148         return self._html_search_meta('twitter:player', html,
1149                                       'twitter card player')
1150
1151     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1152         json_ld = self._search_regex(
1153             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1154         default = kwargs.get('default', NO_DEFAULT)
1155         if not json_ld:
1156             return default if default is not NO_DEFAULT else {}
1157         # JSON-LD may be malformed and thus `fatal` should be respected.
1158         # At the same time `default` may be passed that assumes `fatal=False`
1159         # for _search_regex. Let's simulate the same behavior here as well.
1160         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1161         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1162
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Convert JSON-LD data (raw string, dict or list of dicts) into an
        info dict.

        expected_type, when given, restricts extraction to top-level
        entities of that @type. Returns a dict with all None values
        stripped; may be empty.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize to a list so single entities and entity lists are
            # handled uniformly below
            json_ld = [json_ld]

        # Maps schema.org interactionType name suffixes to info dict
        # *_count key prefixes
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            # Fill *_count fields from InteractionCounter entries; the
            # first occurrence of each kind wins
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # interactionType is a URL; only its last path segment
                # identifies the action
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Populate info from a schema.org VideoObject entity
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only consider entities declaring the schema.org context
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Keep scanning: later entities may contribute more fields
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                # Stop after the first matching non-VideoObject entity
                break
        return dict((k, v) for k, v in info.items() if v is not None)
1256
1257     @staticmethod
1258     def _hidden_inputs(html):
1259         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1260         hidden_inputs = {}
1261         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1262             attrs = extract_attributes(input)
1263             if not input:
1264                 continue
1265             if attrs.get('type') not in ('hidden', 'submit'):
1266                 continue
1267             name = attrs.get('name') or attrs.get('id')
1268             value = attrs.get('value')
1269             if name and value is not None:
1270                 hidden_inputs[name] = value
1271         return hidden_inputs
1272
1273     def _form_hidden_inputs(self, form_id, html):
1274         form = self._search_regex(
1275             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1276             html, '%s form' % form_id, group='form')
1277         return self._hidden_inputs(form)
1278
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort formats in place from worst to best quality.

        field_preference, when given as a list/tuple of field names,
        overrides the default heuristic ordering below.
        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Explicit field order requested by the caller; missing
                # values sort last ('' for format_id, -1 for the rest)
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            # Plain HTTP(S) is preferred; RTSP is penalized the most
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    # Unknown extensions rank below every listed one
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare left to right, so earlier entries dominate
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1354
1355     def _check_formats(self, formats, video_id):
1356         if formats:
1357             formats[:] = filter(
1358                 lambda f: self._is_valid_url(
1359                     f['url'], video_id,
1360                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1361                 formats)
1362
1363     @staticmethod
1364     def _remove_duplicate_formats(formats):
1365         format_urls = set()
1366         unique_formats = []
1367         for f in formats:
1368             if f['url'] not in format_urls:
1369                 format_urls.add(f['url'])
1370                 unique_formats.append(f)
1371         formats[:] = unique_formats
1372
1373     def _is_valid_url(self, url, video_id, item='video', headers={}):
1374         url = self._proto_relative_url(url, scheme='http:')
1375         # For now assume non HTTP(S) URLs always valid
1376         if not (url.startswith('http://') or url.startswith('https://')):
1377             return True
1378         try:
1379             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1380             return True
1381         except ExtractorError as e:
1382             if isinstance(e.cause, compat_urllib_error.URLError):
1383                 self.to_screen(
1384                     '%s: %s URL is invalid, skipping' % (video_id, item))
1385                 return False
1386             raise
1387
1388     def http_scheme(self):
1389         """ Either "http:" or "https:", depending on the user's preferences """
1390         return (
1391             'http:'
1392             if self._downloader.params.get('prefer_insecure', False)
1393             else 'https:')
1394
1395     def _proto_relative_url(self, url, scheme=None):
1396         if url is None:
1397             return url
1398         if url.startswith('//'):
1399             if scheme is None:
1400                 scheme = self.http_scheme()
1401             return scheme + url
1402         else:
1403             return url
1404
1405     def _sleep(self, timeout, video_id, msg_template=None):
1406         if msg_template is None:
1407             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1408         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1409         self.to_screen(msg)
1410         time.sleep(timeout)
1411
1412     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1413                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1414                              fatal=True, m3u8_id=None):
1415         manifest = self._download_xml(
1416             manifest_url, video_id, 'Downloading f4m manifest',
1417             'Unable to download f4m manifest',
1418             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1419             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1420             transform_source=transform_source,
1421             fatal=fatal)
1422
1423         if manifest is False:
1424             return []
1425
1426         return self._parse_f4m_formats(
1427             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1428             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1429
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m (HDS) manifest XML element into a list of format dicts.

        manifest: parsed XML element of the f4m document.
        manifest_url: URL the manifest was fetched from; used to resolve
        relative media URLs and as the format URL itself.
        Returns []: for DRM/Akamai-protected manifests nothing is extracted.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # No 1.0 media nodes: retry with the f4m 2.0 namespace
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            # Audio-only manifest: mark every extracted format as videoless
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Fall back to the node index when no bitrate is advertised
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    # Referenced resource is an HLS playlist: delegate
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1527
1528     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1529         return {
1530             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1531             'url': m3u8_url,
1532             'ext': ext,
1533             'protocol': 'm3u8',
1534             'preference': preference - 100 if preference else -100,
1535             'resolution': 'multiple',
1536             'format_note': 'Quality selection URL',
1537         }
1538
1539     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1540                               entry_protocol='m3u8', preference=None,
1541                               m3u8_id=None, note=None, errnote=None,
1542                               fatal=True, live=False):
1543         res = self._download_webpage_handle(
1544             m3u8_url, video_id,
1545             note=note or 'Downloading m3u8 information',
1546             errnote=errnote or 'Failed to download m3u8 information',
1547             fatal=fatal)
1548
1549         if res is False:
1550             return []
1551
1552         m3u8_doc, urlh = res
1553         m3u8_url = urlh.geturl()
1554
1555         return self._parse_m3u8_formats(
1556             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1557             preference=preference, m3u8_id=m3u8_id, live=live)
1558
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS playlist document into a list of format dicts.

        m3u8_doc: the playlist text.
        m3u8_url: URL it was fetched from (base for relative entries).
        live: when True, bandwidth/name are not folded into format_id since
        they may vary over time for live streams.
        Returns [] for DRM-protected playlists; a media (non-master)
        playlist is returned as a single passthrough format.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve a possibly-relative playlist entry against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; consumed by
        # the variant URL line that follows it
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Register an EXT-X-MEDIA rendition; AUDIO/VIDEO renditions
            # carrying a URI become formats of their own
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Non-tag line: the URL of the variant stream described by
                # the preceding EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing audio group an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1717
1718     @staticmethod
1719     def _xpath_ns(path, namespace=None):
1720         if not namespace:
1721             return path
1722         out = []
1723         for c in path.split('/'):
1724             if not c or c == '.':
1725                 out.append(c)
1726             else:
1727                 out.append('{%s}%s' % (namespace, c))
1728         return '/'.join(out)
1729
1730     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1731         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1732
1733         if smil is False:
1734             assert not fatal
1735             return []
1736
1737         namespace = self._parse_smil_namespace(smil)
1738
1739         return self._parse_smil_formats(
1740             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1741
1742     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1743         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1744         if smil is False:
1745             return {}
1746         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1747
1748     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1749         return self._download_xml(
1750             smil_url, video_id, 'Downloading SMIL file',
1751             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1752
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Parse a SMIL document into an info dict (formats, subtitles, metadata)."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # NOTE(review): the video_id parameter is overridden here with the
        # basename of the SMIL URL — confirm this shadowing is intentional
        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        # Pick up title/description/upload date from <head><meta> entries;
        # only the first occurrence of each is kept
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        # Every <image> node with a src becomes a thumbnail candidate
        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
1792
    def _parse_smil_namespace(self, smil):
        """Extract the XML namespace from the root <smil> tag, or None."""
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1796
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from the <video>/<audio> nodes of a SMIL document.

        transform_rtmp_url: optional callable (streamer, play_path) ->
        (streamer, play_path), applied to each RTMP format after creation.
        """
        # Relative sources resolve against <meta base="..."> (or httpBase)
        # from the head, falling back to the SMIL URL itself
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # Deduplicate media nodes by their src attribute
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single entry usually means a stream-level playlist with no
                # quality metadata; copy it over from the SMIL medium attributes
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            # NOTE(review): the original (unresolved) src is validated here
            # rather than the resolved src_url — confirm this is intentional
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats
1890
1891     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1892         urls = []
1893         subtitles = {}
1894         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1895             src = textstream.get('src')
1896             if not src or src in urls:
1897                 continue
1898             urls.append(src)
1899             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1900             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1901             subtitles.setdefault(lang, []).append({
1902                 'url': src,
1903                 'ext': ext,
1904             })
1905         return subtitles
1906
1907     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1908         xspf = self._download_xml(
1909             xspf_url, playlist_id, 'Downloading xpsf playlist',
1910             'Unable to download xspf manifest', fatal=fatal)
1911         if xspf is False:
1912             return []
1913         return self._parse_xspf(
1914             xspf, playlist_id, xspf_url=xspf_url,
1915             xspf_base_url=base_url(xspf_url))
1916
1917     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1918         NS_MAP = {
1919             'xspf': 'http://xspf.org/ns/0/',
1920             's1': 'http://static.streamone.nl/player/ns/0',
1921         }
1922
1923         entries = []
1924         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1925             title = xpath_text(
1926                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1927             description = xpath_text(
1928                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1929             thumbnail = xpath_text(
1930                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1931             duration = float_or_none(
1932                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1933
1934             formats = []
1935             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1936                 format_url = urljoin(xspf_base_url, location.text)
1937                 if not format_url:
1938                     continue
1939                 formats.append({
1940                     'url': format_url,
1941                     'manifest_url': xspf_url,
1942                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1943                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1944                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1945                 })
1946             self._sort_formats(formats)
1947
1948             entries.append({
1949                 'id': playlist_id,
1950                 'title': title,
1951                 'description': description,
1952                 'thumbnail': thumbnail,
1953                 'duration': duration,
1954                 'formats': formats,
1955             })
1956         return entries
1957
1958     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1959         res = self._download_xml_handle(
1960             mpd_url, video_id,
1961             note=note or 'Downloading MPD manifest',
1962             errnote=errnote or 'Failed to download MPD manifest',
1963             fatal=fatal)
1964         if res is False:
1965             return []
1966         mpd_doc, urlh = res
1967         mpd_base_url = base_url(urlh.geturl())
1968
1969         return self._parse_mpd_formats(
1970             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1971             formats_dict=formats_dict, mpd_url=mpd_url)
1972
1973     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1974         """
1975         Parse formats from MPD manifest.
1976         References:
1977          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1978             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1979          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1980         """
1981         if mpd_doc.get('type') == 'dynamic':
1982             return []
1983
1984         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1985
1986         def _add_ns(path):
1987             return self._xpath_ns(path, namespace)
1988
1989         def is_drm_protected(element):
1990             return element.find(_add_ns('ContentProtection')) is not None
1991
1992         def extract_multisegment_info(element, ms_parent_info):
1993             ms_info = ms_parent_info.copy()
1994
1995             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1996             # common attributes and elements.  We will only extract relevant
1997             # for us.
1998             def extract_common(source):
1999                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2000                 if segment_timeline is not None:
2001                     s_e = segment_timeline.findall(_add_ns('S'))
2002                     if s_e:
2003                         ms_info['total_number'] = 0
2004                         ms_info['s'] = []
2005                         for s in s_e:
2006                             r = int(s.get('r', 0))
2007                             ms_info['total_number'] += 1 + r
2008                             ms_info['s'].append({
2009                                 't': int(s.get('t', 0)),
2010                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2011                                 'd': int(s.attrib['d']),
2012                                 'r': r,
2013                             })
2014                 start_number = source.get('startNumber')
2015                 if start_number:
2016                     ms_info['start_number'] = int(start_number)
2017                 timescale = source.get('timescale')
2018                 if timescale:
2019                     ms_info['timescale'] = int(timescale)
2020                 segment_duration = source.get('duration')
2021                 if segment_duration:
2022                     ms_info['segment_duration'] = float(segment_duration)
2023
2024             def extract_Initialization(source):
2025                 initialization = source.find(_add_ns('Initialization'))
2026                 if initialization is not None:
2027                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2028
2029             segment_list = element.find(_add_ns('SegmentList'))
2030             if segment_list is not None:
2031                 extract_common(segment_list)
2032                 extract_Initialization(segment_list)
2033                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2034                 if segment_urls_e:
2035                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2036             else:
2037                 segment_template = element.find(_add_ns('SegmentTemplate'))
2038                 if segment_template is not None:
2039                     extract_common(segment_template)
2040                     media = segment_template.get('media')
2041                     if media:
2042                         ms_info['media'] = media
2043                     initialization = segment_template.get('initialization')
2044                     if initialization:
2045                         ms_info['initialization'] = initialization
2046                     else:
2047                         extract_Initialization(segment_template)
2048             return ms_info
2049
2050         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2051         formats = []
2052         for period in mpd_doc.findall(_add_ns('Period')):
2053             period_duration = parse_duration(period.get('duration')) or mpd_duration
2054             period_ms_info = extract_multisegment_info(period, {
2055                 'start_number': 1,
2056                 'timescale': 1,
2057             })
2058             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2059                 if is_drm_protected(adaptation_set):
2060                     continue
2061                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2062                 for representation in adaptation_set.findall(_add_ns('Representation')):
2063                     if is_drm_protected(representation):
2064                         continue
2065                     representation_attrib = adaptation_set.attrib.copy()
2066                     representation_attrib.update(representation.attrib)
2067                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2068                     mime_type = representation_attrib['mimeType']
2069                     content_type = mime_type.split('/')[0]
2070                     if content_type == 'text':
2071                         # TODO implement WebVTT downloading
2072                         pass
2073                     elif content_type in ('video', 'audio'):
2074                         base_url = ''
2075                         for element in (representation, adaptation_set, period, mpd_doc):
2076                             base_url_e = element.find(_add_ns('BaseURL'))
2077                             if base_url_e is not None:
2078                                 base_url = base_url_e.text + base_url
2079                                 if re.match(r'^https?://', base_url):
2080                                     break
2081                         if mpd_base_url and not re.match(r'^https?://', base_url):
2082                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2083                                 mpd_base_url += '/'
2084                             base_url = mpd_base_url + base_url
2085                         representation_id = representation_attrib.get('id')
2086                         lang = representation_attrib.get('lang')
2087                         url_el = representation.find(_add_ns('BaseURL'))
2088                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2089                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2090                         f = {
2091                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2092                             'url': base_url,
2093                             'manifest_url': mpd_url,
2094                             'ext': mimetype2ext(mime_type),
2095                             'width': int_or_none(representation_attrib.get('width')),
2096                             'height': int_or_none(representation_attrib.get('height')),
2097                             'tbr': float_or_none(bandwidth, 1000),
2098                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2099                             'fps': int_or_none(representation_attrib.get('frameRate')),
2100                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2101                             'format_note': 'DASH %s' % content_type,
2102                             'filesize': filesize,
2103                             'container': mimetype2ext(mime_type) + '_dash',
2104                         }
2105                         f.update(parse_codecs(representation_attrib.get('codecs')))
2106                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2107
2108                         def prepare_template(template_name, identifiers):
2109                             tmpl = representation_ms_info[template_name]
2110                             # First of, % characters outside $...$ templates
2111                             # must be escaped by doubling for proper processing
2112                             # by % operator string formatting used further (see
2113                             # https://github.com/rg3/youtube-dl/issues/16867).
2114                             t = ''
2115                             in_template = False
2116                             for c in tmpl:
2117                                 t += c
2118                                 if c == '$':
2119                                     in_template = not in_template
2120                                 elif c == '%' and not in_template:
2121                                     t += c
2122                             # Next, $...$ templates are translated to their
2123                             # %(...) counterparts to be used with % operator
2124                             t = t.replace('$RepresentationID$', representation_id)
2125                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2126                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2127                             t.replace('$$', '$')
2128                             return t
2129
2130                         # @initialization is a regular template like @media one
2131                         # so it should be handled just the same way (see
2132                         # https://github.com/rg3/youtube-dl/issues/11605)
2133                         if 'initialization' in representation_ms_info:
2134                             initialization_template = prepare_template(
2135                                 'initialization',
2136                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2137                                 # $Time$ shall not be included for @initialization thus
2138                                 # only $Bandwidth$ remains
2139                                 ('Bandwidth', ))
2140                             representation_ms_info['initialization_url'] = initialization_template % {
2141                                 'Bandwidth': bandwidth,
2142                             }
2143
2144                         def location_key(location):
2145                             return 'url' if re.match(r'^https?://', location) else 'path'
2146
2147                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2148
2149                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2150                             media_location_key = location_key(media_template)
2151
2152                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2153                             # can't be used at the same time
2154                             if '%(Number' in media_template and 's' not in representation_ms_info:
2155                                 segment_duration = None
2156                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2157                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2158                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2159                                 representation_ms_info['fragments'] = [{
2160                                     media_location_key: media_template % {
2161                                         'Number': segment_number,
2162                                         'Bandwidth': bandwidth,
2163                                     },
2164                                     'duration': segment_duration,
2165                                 } for segment_number in range(
2166                                     representation_ms_info['start_number'],
2167                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2168                             else:
2169                                 # $Number*$ or $Time$ in media template with S list available
2170                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2171                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2172                                 representation_ms_info['fragments'] = []
2173                                 segment_time = 0
2174                                 segment_d = None
2175                                 segment_number = representation_ms_info['start_number']
2176
2177                                 def add_segment_url():
2178                                     segment_url = media_template % {
2179                                         'Time': segment_time,
2180                                         'Bandwidth': bandwidth,
2181                                         'Number': segment_number,
2182                                     }
2183                                     representation_ms_info['fragments'].append({
2184                                         media_location_key: segment_url,
2185                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2186                                     })
2187
2188                                 for num, s in enumerate(representation_ms_info['s']):
2189                                     segment_time = s.get('t') or segment_time
2190                                     segment_d = s['d']
2191                                     add_segment_url()
2192                                     segment_number += 1
2193                                     for r in range(s.get('r', 0)):
2194                                         segment_time += segment_d
2195                                         add_segment_url()
2196                                         segment_number += 1
2197                                     segment_time += segment_d
2198                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2199                             # No media template
2200                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2201                             # or any YouTube dashsegments video
2202                             fragments = []
2203                             segment_index = 0
2204                             timescale = representation_ms_info['timescale']
2205                             for s in representation_ms_info['s']:
2206                                 duration = float_or_none(s['d'], timescale)
2207                                 for r in range(s.get('r', 0) + 1):
2208                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2209                                     fragments.append({
2210                                         location_key(segment_uri): segment_uri,
2211                                         'duration': duration,
2212                                     })
2213                                     segment_index += 1
2214                             representation_ms_info['fragments'] = fragments
2215                         elif 'segment_urls' in representation_ms_info:
2216                             # Segment URLs with no SegmentTimeline
2217                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2218                             # https://github.com/rg3/youtube-dl/pull/14844
2219                             fragments = []
2220                             segment_duration = float_or_none(
2221                                 representation_ms_info['segment_duration'],
2222                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2223                             for segment_url in representation_ms_info['segment_urls']:
2224                                 fragment = {
2225                                     location_key(segment_url): segment_url,
2226                                 }
2227                                 if segment_duration:
2228                                     fragment['duration'] = segment_duration
2229                                 fragments.append(fragment)
2230                             representation_ms_info['fragments'] = fragments
2231                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2232                         # No fragments key is present in this case.
2233                         if 'fragments' in representation_ms_info:
2234                             f.update({
2235                                 'fragment_base_url': base_url,
2236                                 'fragments': [],
2237                                 'protocol': 'http_dash_segments',
2238                             })
2239                             if 'initialization_url' in representation_ms_info:
2240                                 initialization_url = representation_ms_info['initialization_url']
2241                                 if not f.get('url'):
2242                                     f['url'] = initialization_url
2243                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2244                             f['fragments'].extend(representation_ms_info['fragments'])
2245                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2246                         # is not necessarily unique within a Period thus formats with
2247                         # the same `format_id` are quite possible. There are numerous examples
2248                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2249                         # https://github.com/rg3/youtube-dl/issues/13919)
2250                         full_info = formats_dict.get(representation_id, {}).copy()
2251                         full_info.update(f)
2252                         formats.append(full_info)
2253                     else:
2254                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2255         return formats
2256
2257     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2258         res = self._download_xml_handle(
2259             ism_url, video_id,
2260             note=note or 'Downloading ISM manifest',
2261             errnote=errnote or 'Failed to download ISM manifest',
2262             fatal=fatal)
2263         if res is False:
2264             return []
2265         ism_doc, urlh = res
2266
2267         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2268
2269     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2270         """
2271         Parse formats from ISM manifest.
2272         References:
2273          1. [MS-SSTR]: Smooth Streaming Protocol,
2274             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2275         """
2276         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2277             return []
2278
2279         duration = int(ism_doc.attrib['Duration'])
2280         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2281
2282         formats = []
2283         for stream in ism_doc.findall('StreamIndex'):
2284             stream_type = stream.get('Type')
2285             if stream_type not in ('video', 'audio'):
2286                 continue
2287             url_pattern = stream.attrib['Url']
2288             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2289             stream_name = stream.get('Name')
2290             for track in stream.findall('QualityLevel'):
2291                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2292                 # TODO: add support for WVC1 and WMAP
2293                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2294                     self.report_warning('%s is not a supported codec' % fourcc)
2295                     continue
2296                 tbr = int(track.attrib['Bitrate']) // 1000
2297                 # [1] does not mention Width and Height attributes. However,
2298                 # they're often present while MaxWidth and MaxHeight are
2299                 # missing, so should be used as fallbacks
2300                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2301                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2302                 sampling_rate = int_or_none(track.get('SamplingRate'))
2303
2304                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2305                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2306
2307                 fragments = []
2308                 fragment_ctx = {
2309                     'time': 0,
2310                 }
2311                 stream_fragments = stream.findall('c')
2312                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2313                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2314                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2315                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2316                     if not fragment_ctx['duration']:
2317                         try:
2318                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2319                         except IndexError:
2320                             next_fragment_time = duration
2321                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2322                     for _ in range(fragment_repeat):
2323                         fragments.append({
2324                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2325                             'duration': fragment_ctx['duration'] / stream_timescale,
2326                         })
2327                         fragment_ctx['time'] += fragment_ctx['duration']
2328
2329                 format_id = []
2330                 if ism_id:
2331                     format_id.append(ism_id)
2332                 if stream_name:
2333                     format_id.append(stream_name)
2334                 format_id.append(compat_str(tbr))
2335
2336                 formats.append({
2337                     'format_id': '-'.join(format_id),
2338                     'url': ism_url,
2339                     'manifest_url': ism_url,
2340                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2341                     'width': width,
2342                     'height': height,
2343                     'tbr': tbr,
2344                     'asr': sampling_rate,
2345                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2346                     'acodec': 'none' if stream_type == 'video' else fourcc,
2347                     'protocol': 'ism',
2348                     'fragments': fragments,
2349                     '_download_params': {
2350                         'duration': duration,
2351                         'timescale': stream_timescale,
2352                         'width': width or 0,
2353                         'height': height or 0,
2354                         'fourcc': fourcc,
2355                         'codec_private_data': track.get('CodecPrivateData'),
2356                         'sampling_rate': sampling_rate,
2357                         'channels': int_or_none(track.get('Channels', 2)),
2358                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2359                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2360                     },
2361                 })
2362         return formats
2363
2364     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2365         def absolute_url(item_url):
2366             return urljoin(base_url, item_url)
2367
2368         def parse_content_type(content_type):
2369             if not content_type:
2370                 return {}
2371             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2372             if ctr:
2373                 mimetype, codecs = ctr.groups()
2374                 f = parse_codecs(codecs)
2375                 f['ext'] = mimetype2ext(mimetype)
2376                 return f
2377             return {}
2378
2379         def _media_formats(src, cur_media_type, type_info={}):
2380             full_url = absolute_url(src)
2381             ext = type_info.get('ext') or determine_ext(full_url)
2382             if ext == 'm3u8':
2383                 is_plain_url = False
2384                 formats = self._extract_m3u8_formats(
2385                     full_url, video_id, ext='mp4',
2386                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2387                     preference=preference, fatal=False)
2388             elif ext == 'mpd':
2389                 is_plain_url = False
2390                 formats = self._extract_mpd_formats(
2391                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2392             else:
2393                 is_plain_url = True
2394                 formats = [{
2395                     'url': full_url,
2396                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2397                 }]
2398             return is_plain_url, formats
2399
2400         entries = []
2401         # amp-video and amp-audio are very similar to their HTML5 counterparts
2402         # so we wll include them right here (see
2403         # https://www.ampproject.org/docs/reference/components/amp-video)
2404         media_tags = [(media_tag, media_type, '')
2405                       for media_tag, media_type
2406                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2407         media_tags.extend(re.findall(
2408             # We only allow video|audio followed by a whitespace or '>'.
2409             # Allowing more characters may end up in significant slow down (see
2410             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2411             # http://www.porntrex.com/maps/videositemap.xml).
2412             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2413         for media_tag, media_type, media_content in media_tags:
2414             media_info = {
2415                 'formats': [],
2416                 'subtitles': {},
2417             }
2418             media_attributes = extract_attributes(media_tag)
2419             src = media_attributes.get('src')
2420             if src:
2421                 _, formats = _media_formats(src, media_type)
2422                 media_info['formats'].extend(formats)
2423             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2424             if media_content:
2425                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2426                     source_attributes = extract_attributes(source_tag)
2427                     src = source_attributes.get('src')
2428                     if not src:
2429                         continue
2430                     f = parse_content_type(source_attributes.get('type'))
2431                     is_plain_url, formats = _media_formats(src, media_type, f)
2432                     if is_plain_url:
2433                         # res attribute is not standard but seen several times
2434                         # in the wild
2435                         f.update({
2436                             'height': int_or_none(source_attributes.get('res')),
2437                             'format_id': source_attributes.get('label'),
2438                         })
2439                         f.update(formats[0])
2440                         media_info['formats'].append(f)
2441                     else:
2442                         media_info['formats'].extend(formats)
2443                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2444                     track_attributes = extract_attributes(track_tag)
2445                     kind = track_attributes.get('kind')
2446                     if not kind or kind in ('subtitles', 'captions'):
2447                         src = track_attributes.get('src')
2448                         if not src:
2449                             continue
2450                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2451                         media_info['subtitles'].setdefault(lang, []).append({
2452                             'url': absolute_url(src),
2453                         })
2454             for f in media_info['formats']:
2455                 f.setdefault('http_headers', {})['Referer'] = base_url
2456             if media_info['formats'] or media_info['subtitles']:
2457                 entries.append(media_info)
2458         return entries
2459
2460     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2461         formats = []
2462         hdcore_sign = 'hdcore=3.7.0'
2463         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2464         hds_host = hosts.get('hds')
2465         if hds_host:
2466             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2467         if 'hdcore=' not in f4m_url:
2468             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2469         f4m_formats = self._extract_f4m_formats(
2470             f4m_url, video_id, f4m_id='hds', fatal=False)
2471         for entry in f4m_formats:
2472             entry.update({'extra_param_to_segment_url': hdcore_sign})
2473         formats.extend(f4m_formats)
2474         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2475         hls_host = hosts.get('hls')
2476         if hls_host:
2477             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2478         formats.extend(self._extract_m3u8_formats(
2479             m3u8_url, video_id, 'mp4', 'm3u8_native',
2480             m3u8_id='hls', fatal=False))
2481         return formats
2482
2483     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2484         query = compat_urlparse.urlparse(url).query
2485         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2486         mobj = re.search(
2487             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2488         url_base = mobj.group('url')
2489         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2490         formats = []
2491
2492         def manifest_url(manifest):
2493             m_url = '%s/%s' % (http_base_url, manifest)
2494             if query:
2495                 m_url += '?%s' % query
2496             return m_url
2497
2498         if 'm3u8' not in skip_protocols:
2499             formats.extend(self._extract_m3u8_formats(
2500                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2501                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2502         if 'f4m' not in skip_protocols:
2503             formats.extend(self._extract_f4m_formats(
2504                 manifest_url('manifest.f4m'),
2505                 video_id, f4m_id='hds', fatal=False))
2506         if 'dash' not in skip_protocols:
2507             formats.extend(self._extract_mpd_formats(
2508                 manifest_url('manifest.mpd'),
2509                 video_id, mpd_id='dash', fatal=False))
2510         if re.search(r'(?:/smil:|\.smil)', url_base):
2511             if 'smil' not in skip_protocols:
2512                 rtmp_formats = self._extract_smil_formats(
2513                     manifest_url('jwplayer.smil'),
2514                     video_id, fatal=False)
2515                 for rtmp_format in rtmp_formats:
2516                     rtsp_format = rtmp_format.copy()
2517                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2518                     del rtsp_format['play_path']
2519                     del rtsp_format['ext']
2520                     rtsp_format.update({
2521                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2522                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2523                         'protocol': 'rtsp',
2524                     })
2525                     formats.extend([rtmp_format, rtsp_format])
2526         else:
2527             for protocol in ('rtmp', 'rtsp'):
2528                 if protocol not in skip_protocols:
2529                     formats.append({
2530                         'url': '%s:%s' % (protocol, url_base),
2531                         'format_id': protocol,
2532                         'protocol': protocol,
2533                     })
2534         return formats
2535
2536     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2537         mobj = re.search(
2538             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2539             webpage)
2540         if mobj:
2541             try:
2542                 jwplayer_data = self._parse_json(mobj.group('options'),
2543                                                  video_id=video_id,
2544                                                  transform_source=transform_source)
2545             except ExtractorError:
2546                 pass
2547             else:
2548                 if isinstance(jwplayer_data, dict):
2549                     return jwplayer_data
2550
2551     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2552         jwplayer_data = self._find_jwplayer_data(
2553             webpage, video_id, transform_source=js_to_json)
2554         return self._parse_jwplayer_data(
2555             jwplayer_data, video_id, *args, **kwargs)
2556
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a parsed JWPlayer setup/config dict into a single info dict
        (one playlist entry) or a playlist result (several entries).

        require_title -- when True a missing title raises (dict key access);
                         otherwise the title may be absent
        m3u8_id/mpd_id -- format id prefixes for HLS/DASH formats
        rtmp_params -- extra fields merged into RTMP format dicts
        base_url -- base URL for resolving relative source/track URLs
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks, keyed by label ('en' fallback)
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Other track kinds (e.g. thumbnails/chapters) are ignored
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube source: hand the URL over to another extractor
            # instead of exposing it as a direct format
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2624
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer 'sources' list into a list of format dicts.

        Recognizes HLS/DASH/SMIL manifests, audio-only sources, plain
        progressive sources and RTMP URLs; duplicate source URLs are skipped.

        m3u8_id/mpd_id -- format id prefixes for HLS/DASH formats
        rtmp_params -- extra fields merged into RTMP format dicts
        base_url -- base URL for resolving relative source URLs
        """
        urls = []  # source URLs already handled (deduplication)
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            if source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            # Prefer the declared MIME type, fall back to the URL extension
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        # e.g. rtmp://host/app/ + mp4: + path
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2691
2692     def _live_title(self, name):
2693         """ Generate the title for a live video """
2694         now = datetime.datetime.now()
2695         now_str = now.strftime('%Y-%m-%d %H:%M')
2696         return name + ' ' + now_str
2697
2698     def _int(self, v, name, fatal=False, **kwargs):
2699         res = int_or_none(v, **kwargs)
2700         if 'get_attr' in kwargs:
2701             print(getattr(v, kwargs['get_attr']))
2702         if res is None:
2703             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2704             if fatal:
2705                 raise ExtractorError(msg)
2706             else:
2707                 self._downloader.report_warning(msg)
2708         return res
2709
2710     def _float(self, v, name, fatal=False, **kwargs):
2711         res = float_or_none(v, **kwargs)
2712         if res is None:
2713             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2714             if fatal:
2715                 raise ExtractorError(msg)
2716             else:
2717                 self._downloader.report_warning(msg)
2718         return res
2719
2720     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2721                     path='/', secure=False, discard=False, rest={}, **kwargs):
2722         cookie = compat_cookiejar.Cookie(
2723             0, name, value, port, port is not None, domain, True,
2724             domain.startswith('.'), path, True, secure, expire_time,
2725             discard, None, None, rest)
2726         self._downloader.cookiejar.set_cookie(cookie)
2727
2728     def _get_cookies(self, url):
2729         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2730         req = sanitized_Request(url)
2731         self._downloader.cookiejar.add_cookie_header(req)
2732         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2733
2734     def get_testcases(self, include_onlymatching=False):
2735         t = getattr(self, '_TEST', None)
2736         if t:
2737             assert not hasattr(self, '_TESTS'), \
2738                 '%s has _TEST and _TESTS' % type(self).__name__
2739             tests = [t]
2740         else:
2741             tests = getattr(self, '_TESTS', [])
2742         for t in tests:
2743             if not include_onlymatching and t.get('only_matching', False):
2744                 continue
2745             t['name'] = type(self).__name__[:-len('IE')]
2746             yield t
2747
2748     def is_suitable(self, age_limit):
2749         """ Test whether the extractor is generally suitable for the given
2750         age limit (i.e. pornographic sites are not, all others usually are) """
2751
2752         any_restricted = False
2753         for tc in self.get_testcases(include_onlymatching=False):
2754             if tc.get('playlist', []):
2755                 tc = tc['playlist'][0]
2756             is_restricted = age_restricted(
2757                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2758             if not is_restricted:
2759                 return True
2760             any_restricted = any_restricted or is_restricted
2761         return not any_restricted
2762
2763     def extract_subtitles(self, *args, **kwargs):
2764         if (self._downloader.params.get('writesubtitles', False) or
2765                 self._downloader.params.get('listsubtitles')):
2766             return self._get_subtitles(*args, **kwargs)
2767         return {}
2768
    def _get_subtitles(self, *args, **kwargs):
        """Return available subtitles; overridden by subclasses that
        support subtitles."""
        raise NotImplementedError('This method must be implemented by subclasses')
2771
2772     @staticmethod
2773     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2774         """ Merge subtitle items for one language. Items with duplicated URLs
2775         will be dropped. """
2776         list1_urls = set([item['url'] for item in subtitle_list1])
2777         ret = list(subtitle_list1)
2778         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2779         return ret
2780
2781     @classmethod
2782     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2783         """ Merge two subtitle dictionaries, language by language. """
2784         ret = dict(subtitle_dict1)
2785         for lang in subtitle_dict2:
2786             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2787         return ret
2788
2789     def extract_automatic_captions(self, *args, **kwargs):
2790         if (self._downloader.params.get('writeautomaticsub', False) or
2791                 self._downloader.params.get('listsubtitles')):
2792             return self._get_automatic_captions(*args, **kwargs)
2793         return {}
2794
    def _get_automatic_captions(self, *args, **kwargs):
        """Return available automatic captions; overridden by subclasses
        that support them."""
        raise NotImplementedError('This method must be implemented by subclasses')
2797
2798     def mark_watched(self, *args, **kwargs):
2799         if (self._downloader.params.get('mark_watched', False) and
2800                 (self._get_login_info()[0] is not None or
2801                     self._downloader.params.get('cookiefile') is not None)):
2802             self._mark_watched(*args, **kwargs)
2803
    def _mark_watched(self, *args, **kwargs):
        """Mark the media as watched on the site; overridden by subclasses
        that support it."""
        raise NotImplementedError('This method must be implemented by subclasses')
2806
2807     def geo_verification_headers(self):
2808         headers = {}
2809         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2810         if geo_verification_proxy:
2811             headers['Ytdl-request-proxy'] = geo_verification_proxy
2812         return headers
2813
2814     def _generic_id(self, url):
2815         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2816
2817     def _generic_title(self, url):
2818         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2819
2820
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (default: one result), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search 'URL' and dispatch to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        term = mobj.group('query')
        if prefix == '':
            # No explicit count: fetch the first result only
            return self._get_n_results(term, 1)
        if prefix == 'all':
            return self._get_n_results(term, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, term))
        if n > self._MAX_RESULTS:
            # Clamp to the site's maximum and tell the user
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(term, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY