8452125c8802b1398698c334084c85b673e80964
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_integer_types,
23     compat_http_client,
24     compat_os_name,
25     compat_str,
26     compat_urllib_error,
27     compat_urllib_parse_unquote,
28     compat_urllib_parse_urlencode,
29     compat_urllib_request,
30     compat_urlparse,
31     compat_xml_parse_error,
32 )
33 from ..downloader.f4m import (
34     get_base_url,
35     remove_encrypted_media,
36 )
37 from ..utils import (
38     NO_DEFAULT,
39     age_restricted,
40     base_url,
41     bug_reports_message,
42     clean_html,
43     compiled_regex_type,
44     determine_ext,
45     determine_protocol,
46     error_to_compat_str,
47     ExtractorError,
48     extract_attributes,
49     fix_xml_ampersands,
50     float_or_none,
51     GeoRestrictedError,
52     GeoUtils,
53     int_or_none,
54     js_to_json,
55     JSON_LD_RE,
56     mimetype2ext,
57     orderedSet,
58     parse_codecs,
59     parse_duration,
60     parse_iso8601,
61     parse_m3u8_attributes,
62     RegexNotFoundError,
63     sanitized_Request,
64     sanitize_filename,
65     unescapeHTML,
66     unified_strdate,
67     unified_timestamp,
68     update_Request,
69     update_url_query,
70     urljoin,
71     url_basename,
72     url_or_none,
73     xpath_element,
74     xpath_text,
75     xpath_with_ns,
76 )
77
78
79 class InfoExtractor(object):
80     """Information Extractor class.
81
82     Information extractors are the classes that, given a URL, extract
83     information about the video (or videos) the URL refers to. This
84     information includes the real video URL, the video title, author and
85     others. The information is stored in a dictionary which is then
86     passed to the YoutubeDL. The YoutubeDL processes this
87     information possibly downloading the video to the file system, among
88     other possible outcomes.
89
90     The type field determines the type of the result.
91     By far the most common value (and the default if _type is missing) is
92     "video", which indicates a single video.
93
94     For a video, the dictionaries must include the following fields:
95
96     id:             Video identifier.
97     title:          Video title, unescaped.
98
99     Additionally, it must contain either a formats entry or a url one:
100
101     formats:        A list of dictionaries for each format available, ordered
102                     from worst to best quality.
103
104                     Potential fields:
105                     * url        Mandatory. The URL of the video file
106                     * manifest_url
107                                  The URL of the manifest file in case of
108                                  fragmented media (DASH, hls, hds)
109                     * ext        Will be calculated from URL if missing
110                     * format     A human-readable description of the format
111                                  ("mp4 container with h264/opus").
112                                  Calculated from the format_id, width, height
113                                  and format_note fields if missing.
114                     * format_id  A short description of the format
115                                  ("mp4_h264_opus" or "19").
116                                 Technically optional, but strongly recommended.
117                     * format_note Additional info about the format
118                                  ("3D" or "DASH video")
119                     * width      Width of the video, if known
120                     * height     Height of the video, if known
121                     * resolution Textual description of width and height
122                     * tbr        Average bitrate of audio and video in KBit/s
123                     * abr        Average audio bitrate in KBit/s
124                     * acodec     Name of the audio codec in use
125                     * asr        Audio sampling rate in Hertz
126                     * vbr        Average video bitrate in KBit/s
127                     * fps        Frame rate
128                     * vcodec     Name of the video codec in use
129                     * container  Name of the container format
130                     * filesize   The number of bytes, if known in advance
131                     * filesize_approx  An estimate for the number of bytes
132                     * player_url SWF Player URL (used for rtmpdump).
133                     * protocol   The protocol that will be used for the actual
134                                  download, lower-case.
135                                  "http", "https", "rtsp", "rtmp", "rtmpe",
136                                  "m3u8", "m3u8_native" or "http_dash_segments".
137                     * fragment_base_url
138                                  Base URL for fragments. Each fragment's path
139                                  value (if present) will be relative to
140                                  this URL.
141                     * fragments  A list of fragments of a fragmented media.
142                                  Each fragment entry must contain either an url
143                                  or a path. If an url is present it should be
144                                  considered by a client. Otherwise both path and
145                                  fragment_base_url must be present. Here is
146                                  the list of all potential fields:
147                                  * "url" - fragment's URL
148                                  * "path" - fragment's path relative to
149                                             fragment_base_url
150                                  * "duration" (optional, int or float)
151                                  * "filesize" (optional, int)
152                     * preference Order number of this format. If this field is
153                                  present and not None, the formats get sorted
154                                  by this field, regardless of all other values.
155                                  -1 for default (order by other properties),
156                                  -2 or smaller for less than default.
157                                  < -1000 to hide the format (if there is
158                                     another one which is strictly better)
159                     * language   Language code, e.g. "de" or "en-US".
160                     * language_preference  Is this in the language mentioned in
161                                  the URL?
162                                  10 if it's what the URL is about,
163                                  -1 for default (don't know),
164                                  -10 otherwise, other values reserved for now.
165                     * quality    Order number of the video quality of this
166                                  format, irrespective of the file format.
167                                  -1 for default (order by other properties),
168                                  -2 or smaller for less than default.
169                     * source_preference  Order number for this video source
170                                   (quality takes higher priority)
171                                  -1 for default (order by other properties),
172                                  -2 or smaller for less than default.
173                     * http_headers  A dictionary of additional HTTP headers
174                                  to add to the request.
175                     * stretched_ratio  If given and not 1, indicates that the
176                                  video's pixels are not square.
177                                  width : height ratio as float.
178                     * no_resume  The server does not support resuming the
179                                  (HTTP or RTMP) download. Boolean.
180                     * downloader_options  A dictionary of downloader options as
181                                  described in FileDownloader
182
183     url:            Final video URL.
184     ext:            Video filename extension.
185     format:         The video format, defaults to ext (used for --get-format)
186     player_url:     SWF Player URL (used for rtmpdump).
187
188     The following fields are optional:
189
190     alt_title:      A secondary title of the video.
191     display_id      An alternative identifier for the video, not necessarily
192                     unique, but available before title. Typically, id is
193                     something like "4234987", title "Dancing naked mole rats",
194                     and display_id "dancing-naked-mole-rats"
195     thumbnails:     A list of dictionaries, with the following entries:
196                         * "id" (optional, string) - Thumbnail format ID
197                         * "url"
198                         * "preference" (optional, int) - quality of the image
199                         * "width" (optional, int)
200                         * "height" (optional, int)
201                         * "resolution" (optional, string "{width}x{height}",
202                                         deprecated)
203                         * "filesize" (optional, int)
204     thumbnail:      Full URL to a video thumbnail image.
205     description:    Full video description.
206     uploader:       Full name of the video uploader.
207     license:        License name the video is licensed under.
208     creator:        The creator of the video.
209     release_date:   The date (YYYYMMDD) when the video was released.
210     timestamp:      UNIX timestamp of the moment the video became available.
211     upload_date:    Video upload date (YYYYMMDD).
212                     If not explicitly set, calculated from timestamp.
213     uploader_id:    Nickname or id of the video uploader.
214     uploader_url:   Full URL to a personal webpage of the video uploader.
215     channel:        Full name of the channel the video is uploaded on.
216                     Note that channel fields may or may not repeat uploader
217                     fields. This depends on a particular extractor.
218     channel_id:     Id of the channel.
219     channel_url:    Full URL to a channel webpage.
220     location:       Physical location where the video was filmed.
221     subtitles:      The available subtitles as a dictionary in the format
222                     {tag: subformats}. "tag" is usually a language code, and
223                     "subformats" is a list sorted from lower to higher
224                     preference, each element is a dictionary with the "ext"
225                     entry and one of:
226                         * "data": The subtitles file contents
227                         * "url": A URL pointing to the subtitles file
228                     "ext" will be calculated from URL if missing
229     automatic_captions: Like 'subtitles', used by the YoutubeIE for
230                     automatically generated captions
231     duration:       Length of the video in seconds, as an integer or float.
232     view_count:     How many users have watched the video on the platform.
233     like_count:     Number of positive ratings of the video
234     dislike_count:  Number of negative ratings of the video
235     repost_count:   Number of reposts of the video
236     average_rating: Average rating given by users, the scale used depends on the webpage
237     comment_count:  Number of comments on the video
238     comments:       A list of comments, each with one or more of the following
239                     properties (all but one of text or html optional):
240                         * "author" - human-readable name of the comment author
241                         * "author_id" - user ID of the comment author
242                         * "id" - Comment ID
243                         * "html" - Comment as HTML
244                         * "text" - Plain text of the comment
245                         * "timestamp" - UNIX timestamp of comment
246                         * "parent" - ID of the comment this one is replying to.
247                                      Set to "root" to indicate that this is a
248                                      comment to the original video.
249     age_limit:      Age restriction for the video, as an integer (years)
250     webpage_url:    The URL to the video webpage, if given to youtube-dl it
251                     should allow to get the same result again. (It will be set
252                     by YoutubeDL if it's missing)
253     categories:     A list of categories that the video falls in, for example
254                     ["Sports", "Berlin"]
255     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
256     is_live:        True, False, or None (=unknown). Whether this video is a
257                     live stream that goes on instead of a fixed-length video.
258     start_time:     Time in seconds where the reproduction should start, as
259                     specified in the URL.
260     end_time:       Time in seconds where the reproduction should end, as
261                     specified in the URL.
262     chapters:       A list of dictionaries, with the following entries:
263                         * "start_time" - The start time of the chapter in seconds
264                         * "end_time" - The end time of the chapter in seconds
265                         * "title" (optional, string)
266
267     The following fields should only be used when the video belongs to some logical
268     chapter or section:
269
270     chapter:        Name or title of the chapter the video belongs to.
271     chapter_number: Number of the chapter the video belongs to, as an integer.
272     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
273
274     The following fields should only be used when the video is an episode of some
275     series, programme or podcast:
276
277     series:         Title of the series or programme the video episode belongs to.
278     season:         Title of the season the video episode belongs to.
279     season_number:  Number of the season the video episode belongs to, as an integer.
280     season_id:      Id of the season the video episode belongs to, as a unicode string.
281     episode:        Title of the video episode. Unlike mandatory video title field,
282                     this field should denote the exact title of the video episode
283                     without any kind of decoration.
284     episode_number: Number of the video episode within a season, as an integer.
285     episode_id:     Id of the video episode, as a unicode string.
286
287     The following fields should only be used when the media is a track or a part of
288     a music album:
289
290     track:          Title of the track.
291     track_number:   Number of the track within an album or a disc, as an integer.
292     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
293                     as a unicode string.
294     artist:         Artist(s) of the track.
295     genre:          Genre(s) of the track.
296     album:          Title of the album the track belongs to.
297     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
298     album_artist:   List of all artists appeared on the album (e.g.
299                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
300                     and compilations).
301     disc_number:    Number of the disc or other physical medium the track belongs to,
302                     as an integer.
303     release_year:   Year (YYYY) when the album was released.
304
305     Unless mentioned otherwise, the fields should be Unicode strings.
306
307     Unless mentioned otherwise, None is equivalent to absence of information.
308
309
310     _type "playlist" indicates multiple videos.
311     There must be a key "entries", which is a list, an iterable, or a PagedList
312     object, each element of which is a valid dictionary by this specification.
313
314     Additionally, playlists can have "id", "title", "description", "uploader",
315     "uploader_id", "uploader_url" attributes with the same semantics as videos
316     (see above).
317
318
319     _type "multi_video" indicates that there are multiple videos that
320     form a single show, for example multiple acts of an opera or TV episode.
321     It must have an entries key like a playlist and contain all the keys
322     required for a video at the same time.
323
324
325     _type "url" indicates that the video must be extracted from another
326     location, possibly by a different extractor. Its only required key is:
327     "url" - the next URL to extract.
328     The key "ie_key" can be set to the class name (minus the trailing "IE",
329     e.g. "Youtube") if the extractor class is known in advance.
330     Additionally, the dictionary may have any properties of the resolved entity
331     known in advance, for example "title" if the title of the referred video is
332     known ahead of time.
333
334
335     _type "url_transparent" entities have the same specification as "url", but
336     indicate that the given additional information is more precise than the one
337     associated with the resolved URL.
338     This is useful when a site employs a video service that hosts the video and
339     its technical metadata, but that video service does not embed a useful
340     title, description etc.
341
342
343     Subclasses of this one should re-define the _real_initialize() and
344     _real_extract() methods and define a _VALID_URL regexp.
345     Probably, they should also be added to the list of extractors.
346
347     _GEO_BYPASS attribute may be set to False in order to disable
348     geo restriction bypass mechanisms for a particular extractor.
349     Though it won't disable explicit geo restriction bypass based on
350     country code provided with geo_bypass_country.
351
352     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
353     countries for this extractor. One of these countries will be used by
354     geo restriction bypass mechanism right away in order to bypass
355     geo restriction, of course, if the mechanism is not disabled.
356
357     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
358     IP blocks in CIDR notation for this extractor. One of these IP blocks
359     will be used by geo restriction bypass mechanism similarly
360     to _GEO_COUNTRIES.
361
362     Finally, the _WORKING attribute should be set to False for broken IEs
363     in order to warn the users and skip the tests.
364     """
365
    # True once initialize() has run _real_initialize() for this instance
    _ready = False
    # Downloader object this extractor is attached to (see set_downloader())
    _downloader = None
    # Fake IP passed as X-Forwarded-For HTTP header for geo restriction
    # bypass, None while no such IP has been chosen
    _x_forwarded_for_ip = None
    # Geo restriction bypass settings, described in the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Should be False for broken extractors, reported by working()
    _WORKING = True
373
374     def __init__(self, downloader=None):
375         """Constructor. Receives an optional downloader."""
376         self._ready = False
377         self._x_forwarded_for_ip = None
378         self.set_downloader(downloader)
379
380     @classmethod
381     def suitable(cls, url):
382         """Receives a URL and returns True if suitable for this IE."""
383
384         # This does not use has/getattr intentionally - we want to know whether
385         # we have cached the regexp for *this* class, whereas getattr would also
386         # match the superclass
387         if '_VALID_URL_RE' not in cls.__dict__:
388             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
389         return cls._VALID_URL_RE.match(url) is not None
390
391     @classmethod
392     def _match_id(cls, url):
393         if '_VALID_URL_RE' not in cls.__dict__:
394             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
395         m = cls._VALID_URL_RE.match(url)
396         assert m
397         return compat_str(m.group('id'))
398
399     @classmethod
400     def working(cls):
401         """Getter method for _WORKING."""
402         return cls._WORKING
403
404     def initialize(self):
405         """Initializes an instance (authentication, etc)."""
406         self._initialize_geo_bypass({
407             'countries': self._GEO_COUNTRIES,
408             'ip_blocks': self._GEO_IP_BLOCKS,
409         })
410         if not self._ready:
411             self._real_initialize()
412             self._ready = True
413
414     def _initialize_geo_bypass(self, geo_bypass_context):
415         """
416         Initialize geo restriction bypass mechanism.
417
418         This method is used to initialize geo bypass mechanism based on faking
419         X-Forwarded-For HTTP header. A random country from provided country list
420         is selected and a random IP belonging to this country is generated. This
421         IP will be passed as X-Forwarded-For HTTP header in all subsequent
422         HTTP requests.
423
424         This method will be used for initial geo bypass mechanism initialization
425         during the instance initialization with _GEO_COUNTRIES and
426         _GEO_IP_BLOCKS.
427
428         You may also manually call it from extractor's code if geo bypass
429         information is not available beforehand (e.g. obtained during
430         extraction) or due to some other reason. In this case you should pass
431         this information in geo bypass context passed as first argument. It may
432         contain following fields:
433
434         countries:  List of geo unrestricted countries (similar
435                     to _GEO_COUNTRIES)
436         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
437                     (similar to _GEO_IP_BLOCKS)
438
439         """
440         if not self._x_forwarded_for_ip:
441
442             # Geo bypass mechanism is explicitly disabled by user
443             if not self._downloader.params.get('geo_bypass', True):
444                 return
445
446             if not geo_bypass_context:
447                 geo_bypass_context = {}
448
449             # Backward compatibility: previously _initialize_geo_bypass
450             # expected a list of countries, some 3rd party code may still use
451             # it this way
452             if isinstance(geo_bypass_context, (list, tuple)):
453                 geo_bypass_context = {
454                     'countries': geo_bypass_context,
455                 }
456
457             # The whole point of geo bypass mechanism is to fake IP
458             # as X-Forwarded-For HTTP header based on some IP block or
459             # country code.
460
461             # Path 1: bypassing based on IP block in CIDR notation
462
463             # Explicit IP block specified by user, use it right away
464             # regardless of whether extractor is geo bypassable or not
465             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
466
467             # Otherwise use random IP block from geo bypass context but only
468             # if extractor is known as geo bypassable
469             if not ip_block:
470                 ip_blocks = geo_bypass_context.get('ip_blocks')
471                 if self._GEO_BYPASS and ip_blocks:
472                     ip_block = random.choice(ip_blocks)
473
474             if ip_block:
475                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
476                 if self._downloader.params.get('verbose', False):
477                     self._downloader.to_screen(
478                         '[debug] Using fake IP %s as X-Forwarded-For.'
479                         % self._x_forwarded_for_ip)
480                 return
481
482             # Path 2: bypassing based on country code
483
484             # Explicit country code specified by user, use it right away
485             # regardless of whether extractor is geo bypassable or not
486             country = self._downloader.params.get('geo_bypass_country', None)
487
488             # Otherwise use random country code from geo bypass context but
489             # only if extractor is known as geo bypassable
490             if not country:
491                 countries = geo_bypass_context.get('countries')
492                 if self._GEO_BYPASS and countries:
493                     country = random.choice(countries)
494
495             if country:
496                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
497                 if self._downloader.params.get('verbose', False):
498                     self._downloader.to_screen(
499                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
500                         % (self._x_forwarded_for_ip, country.upper()))
501
502     def extract(self, url):
503         """Extracts URL information and returns it in list of dicts."""
504         try:
505             for _ in range(2):
506                 try:
507                     self.initialize()
508                     ie_result = self._real_extract(url)
509                     if self._x_forwarded_for_ip:
510                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
511                     return ie_result
512                 except GeoRestrictedError as e:
513                     if self.__maybe_fake_ip_and_retry(e.countries):
514                         continue
515                     raise
516         except ExtractorError:
517             raise
518         except compat_http_client.IncompleteRead as e:
519             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
520         except (KeyError, StopIteration) as e:
521             raise ExtractorError('An extractor error has occurred.', cause=e)
522
523     def __maybe_fake_ip_and_retry(self, countries):
524         if (not self._downloader.params.get('geo_bypass_country', None) and
525                 self._GEO_BYPASS and
526                 self._downloader.params.get('geo_bypass', True) and
527                 not self._x_forwarded_for_ip and
528                 countries):
529             country_code = random.choice(countries)
530             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
531             if self._x_forwarded_for_ip:
532                 self.report_warning(
533                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
534                     % (self._x_forwarded_for_ip, country_code.upper()))
535                 return True
536         return False
537
538     def set_downloader(self, downloader):
539         """Sets the downloader for this IE."""
540         self._downloader = downloader
541
542     def _real_initialize(self):
543         """Real initialization process. Redefine in subclasses."""
544         pass
545
546     def _real_extract(self, url):
547         """Real extraction process. Redefine in subclasses."""
548         pass
549
550     @classmethod
551     def ie_key(cls):
552         """A string for getting the InfoExtractor with get_info_extractor"""
553         return compat_str(cls.__name__[:-2])
554
555     @property
556     def IE_NAME(self):
557         return compat_str(type(self).__name__[:-2])
558
559     @staticmethod
560     def __can_accept_status_code(err, expected_status):
561         assert isinstance(err, compat_urllib_error.HTTPError)
562         if expected_status is None:
563             return False
564         if isinstance(expected_status, compat_integer_types):
565             return err.code == expected_status
566         elif isinstance(expected_status, (list, tuple)):
567             return err.code in expected_status
568         elif callable(expected_status):
569             return expected_status(err.code) is True
570         else:
571             assert False
572
573     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
574         """
575         Return the response handle.
576
577         See _download_webpage docstring for arguments specification.
578         """
579         if note is None:
580             self.report_download_webpage(video_id)
581         elif note is not False:
582             if video_id is None:
583                 self.to_screen('%s' % (note,))
584             else:
585                 self.to_screen('%s: %s' % (video_id, note))
586
587         # Some sites check X-Forwarded-For HTTP header in order to figure out
588         # the origin of the client behind proxy. This allows bypassing geo
589         # restriction by faking this header's value to IP that belongs to some
590         # geo unrestricted country. We will do so once we encounter any
591         # geo restriction error.
592         if self._x_forwarded_for_ip:
593             if 'X-Forwarded-For' not in headers:
594                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
595
596         if isinstance(url_or_request, compat_urllib_request.Request):
597             url_or_request = update_Request(
598                 url_or_request, data=data, headers=headers, query=query)
599         else:
600             if query:
601                 url_or_request = update_url_query(url_or_request, query)
602             if data is not None or headers:
603                 url_or_request = sanitized_Request(url_or_request, data, headers)
604         try:
605             return self._downloader.urlopen(url_or_request)
606         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
607             if isinstance(err, compat_urllib_error.HTTPError):
608                 if self.__can_accept_status_code(err, expected_status):
609                     return err.fp
610
611             if errnote is False:
612                 return False
613             if errnote is None:
614                 errnote = 'Unable to download webpage'
615
616             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
617             if fatal:
618                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
619             else:
620                 self._downloader.report_warning(errmsg)
621                 return False
622
623     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
624         """
625         Return a tuple (page content as string, URL handle).
626
627         See _download_webpage docstring for arguments specification.
628         """
629         # Strip hashes from the URL (#1038)
630         if isinstance(url_or_request, (compat_str, str)):
631             url_or_request = url_or_request.partition('#')[0]
632
633         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
634         if urlh is False:
635             assert not fatal
636             return False
637         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
638         return (content, urlh)
639
640     @staticmethod
641     def _guess_encoding_from_content(content_type, webpage_bytes):
642         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
643         if m:
644             encoding = m.group(1)
645         else:
646             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
647                           webpage_bytes[:1024])
648             if m:
649                 encoding = m.group(1).decode('ascii')
650             elif webpage_bytes.startswith(b'\xff\xfe'):
651                 encoding = 'utf-16'
652             else:
653                 encoding = 'utf-8'
654
655         return encoding
656
657     def __check_blocked(self, content):
658         first_block = content[:512]
659         if ('<title>Access to this site is blocked</title>' in content and
660                 'Websense' in first_block):
661             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
662             blocked_iframe = self._html_search_regex(
663                 r'<iframe src="([^"]+)"', content,
664                 'Websense information URL', default=None)
665             if blocked_iframe:
666                 msg += ' Visit %s for more details' % blocked_iframe
667             raise ExtractorError(msg, expected=True)
668         if '<title>The URL you requested has been blocked</title>' in first_block:
669             msg = (
670                 'Access to this webpage has been blocked by Indian censorship. '
671                 'Use a VPN or proxy server (with --proxy) to route around it.')
672             block_msg = self._html_search_regex(
673                 r'</h1><p>(.*?)</p>',
674                 content, 'block message', default=None)
675             if block_msg:
676                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
677             raise ExtractorError(msg, expected=True)
678         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
679                 'blocklist.rkn.gov.ru' in content):
680             raise ExtractorError(
681                 'Access to this webpage has been blocked by decision of the Russian government. '
682                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
683                 expected=True)
684
685     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
686         content_type = urlh.headers.get('Content-Type', '')
687         webpage_bytes = urlh.read()
688         if prefix is not None:
689             webpage_bytes = prefix + webpage_bytes
690         if not encoding:
691             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
692         if self._downloader.params.get('dump_intermediate_pages', False):
693             self.to_screen('Dumping request to ' + urlh.geturl())
694             dump = base64.b64encode(webpage_bytes).decode('ascii')
695             self._downloader.to_screen(dump)
696         if self._downloader.params.get('write_pages', False):
697             basen = '%s_%s' % (video_id, urlh.geturl())
698             if len(basen) > 240:
699                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
700                 basen = basen[:240 - len(h)] + h
701             raw_filename = basen + '.dump'
702             filename = sanitize_filename(raw_filename, restricted=True)
703             self.to_screen('Saving request to ' + filename)
704             # Working around MAX_PATH limitation on Windows (see
705             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
706             if compat_os_name == 'nt':
707                 absfilepath = os.path.abspath(filename)
708                 if len(absfilepath) > 259:
709                     filename = '\\\\?\\' + absfilepath
710             with open(filename, 'wb') as outf:
711                 outf.write(webpage_bytes)
712
713         try:
714             content = webpage_bytes.decode(encoding, 'replace')
715         except LookupError:
716             content = webpage_bytes.decode('utf-8', 'replace')
717
718         self.__check_blocked(content)
719
720         return content
721
722     def _download_webpage(
723             self, url_or_request, video_id, note=None, errnote=None,
724             fatal=True, tries=1, timeout=5, encoding=None, data=None,
725             headers={}, query={}, expected_status=None):
726         """
727         Return the data of the page as a string.
728
729         Arguments:
730         url_or_request -- plain text URL as a string or
731             a compat_urllib_request.Requestobject
732         video_id -- Video/playlist/item identifier (string)
733
734         Keyword arguments:
735         note -- note printed before downloading (string)
736         errnote -- note printed in case of an error (string)
737         fatal -- flag denoting whether error should be considered fatal,
738             i.e. whether it should cause ExtractionError to be raised,
739             otherwise a warning will be reported and extraction continued
740         tries -- number of tries
741         timeout -- sleep interval between tries
742         encoding -- encoding for a page content decoding, guessed automatically
743             when not explicitly specified
744         data -- POST data (bytes)
745         headers -- HTTP headers (dict)
746         query -- URL query (dict)
747         expected_status -- allows to accept failed HTTP requests (non 2xx
748             status code) by explicitly specifying a set of accepted status
749             codes. Can be any of the following entities:
750                 - an integer type specifying an exact failed status code to
751                   accept
752                 - a list or a tuple of integer types specifying a list of
753                   failed status codes to accept
754                 - a callable accepting an actual failed status code and
755                   returning True if it should be accepted
756             Note that this argument does not affect success status codes (2xx)
757             which are always accepted.
758         """
759
760         success = False
761         try_count = 0
762         while success is False:
763             try:
764                 res = self._download_webpage_handle(
765                     url_or_request, video_id, note, errnote, fatal,
766                     encoding=encoding, data=data, headers=headers, query=query,
767                     expected_status=expected_status)
768                 success = True
769             except compat_http_client.IncompleteRead as e:
770                 try_count += 1
771                 if try_count >= tries:
772                     raise e
773                 self._sleep(timeout, video_id)
774         if res is False:
775             return res
776         else:
777             content, _ = res
778             return content
779
780     def _download_xml_handle(
781             self, url_or_request, video_id, note='Downloading XML',
782             errnote='Unable to download XML', transform_source=None,
783             fatal=True, encoding=None, data=None, headers={}, query={},
784             expected_status=None):
785         """
786         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
787
788         See _download_webpage docstring for arguments specification.
789         """
790         res = self._download_webpage_handle(
791             url_or_request, video_id, note, errnote, fatal=fatal,
792             encoding=encoding, data=data, headers=headers, query=query,
793             expected_status=expected_status)
794         if res is False:
795             return res
796         xml_string, urlh = res
797         return self._parse_xml(
798             xml_string, video_id, transform_source=transform_source,
799             fatal=fatal), urlh
800
801     def _download_xml(
802             self, url_or_request, video_id,
803             note='Downloading XML', errnote='Unable to download XML',
804             transform_source=None, fatal=True, encoding=None,
805             data=None, headers={}, query={}, expected_status=None):
806         """
807         Return the xml as an xml.etree.ElementTree.Element.
808
809         See _download_webpage docstring for arguments specification.
810         """
811         res = self._download_xml_handle(
812             url_or_request, video_id, note=note, errnote=errnote,
813             transform_source=transform_source, fatal=fatal, encoding=encoding,
814             data=data, headers=headers, query=query,
815             expected_status=expected_status)
816         return res if res is False else res[0]
817
818     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
819         if transform_source:
820             xml_string = transform_source(xml_string)
821         try:
822             return compat_etree_fromstring(xml_string.encode('utf-8'))
823         except compat_xml_parse_error as ve:
824             errmsg = '%s: Failed to parse XML ' % video_id
825             if fatal:
826                 raise ExtractorError(errmsg, cause=ve)
827             else:
828                 self.report_warning(errmsg + str(ve))
829
830     def _download_json_handle(
831             self, url_or_request, video_id, note='Downloading JSON metadata',
832             errnote='Unable to download JSON metadata', transform_source=None,
833             fatal=True, encoding=None, data=None, headers={}, query={},
834             expected_status=None):
835         """
836         Return a tuple (JSON object, URL handle).
837
838         See _download_webpage docstring for arguments specification.
839         """
840         res = self._download_webpage_handle(
841             url_or_request, video_id, note, errnote, fatal=fatal,
842             encoding=encoding, data=data, headers=headers, query=query,
843             expected_status=expected_status)
844         if res is False:
845             return res
846         json_string, urlh = res
847         return self._parse_json(
848             json_string, video_id, transform_source=transform_source,
849             fatal=fatal), urlh
850
851     def _download_json(
852             self, url_or_request, video_id, note='Downloading JSON metadata',
853             errnote='Unable to download JSON metadata', transform_source=None,
854             fatal=True, encoding=None, data=None, headers={}, query={},
855             expected_status=None):
856         """
857         Return the JSON object as a dict.
858
859         See _download_webpage docstring for arguments specification.
860         """
861         res = self._download_json_handle(
862             url_or_request, video_id, note=note, errnote=errnote,
863             transform_source=transform_source, fatal=fatal, encoding=encoding,
864             data=data, headers=headers, query=query,
865             expected_status=expected_status)
866         return res if res is False else res[0]
867
868     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
869         if transform_source:
870             json_string = transform_source(json_string)
871         try:
872             return json.loads(json_string)
873         except ValueError as ve:
874             errmsg = '%s: Failed to parse JSON ' % video_id
875             if fatal:
876                 raise ExtractorError(errmsg, cause=ve)
877             else:
878                 self.report_warning(errmsg + str(ve))
879
880     def report_warning(self, msg, video_id=None):
881         idstr = '' if video_id is None else '%s: ' % video_id
882         self._downloader.report_warning(
883             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
884
885     def to_screen(self, msg):
886         """Print msg to screen, prefixing it with '[ie_name]'"""
887         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
888
889     def report_extraction(self, id_or_name):
890         """Report information extraction."""
891         self.to_screen('%s: Extracting information' % id_or_name)
892
893     def report_download_webpage(self, video_id):
894         """Report webpage download."""
895         self.to_screen('%s: Downloading webpage' % video_id)
896
897     def report_age_confirmation(self):
898         """Report attempt to confirm age."""
899         self.to_screen('Confirming age')
900
901     def report_login(self):
902         """Report attempt to log in."""
903         self.to_screen('Logging in')
904
905     @staticmethod
906     def raise_login_required(msg='This video is only available for registered users'):
907         raise ExtractorError(
908             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
909             expected=True)
910
911     @staticmethod
912     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
913         raise GeoRestrictedError(msg, countries=countries)
914
915     # Methods for following #608
916     @staticmethod
917     def url_result(url, ie=None, video_id=None, video_title=None):
918         """Returns a URL that points to a page that should be processed"""
919         # TODO: ie should be the class used for getting the info
920         video_info = {'_type': 'url',
921                       'url': url,
922                       'ie_key': ie}
923         if video_id is not None:
924             video_info['id'] = video_id
925         if video_title is not None:
926             video_info['title'] = video_title
927         return video_info
928
929     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
930         urls = orderedSet(
931             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
932             for m in matches)
933         return self.playlist_result(
934             urls, playlist_id=playlist_id, playlist_title=playlist_title)
935
936     @staticmethod
937     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
938         """Returns a playlist"""
939         video_info = {'_type': 'playlist',
940                       'entries': entries}
941         if playlist_id:
942             video_info['id'] = playlist_id
943         if playlist_title:
944             video_info['title'] = playlist_title
945         if playlist_description:
946             video_info['description'] = playlist_description
947         return video_info
948
949     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
950         """
951         Perform a regex search on the given string, using a single or a list of
952         patterns returning the first matching group.
953         In case of failure return a default value or raise a WARNING or a
954         RegexNotFoundError, depending on fatal, specifying the field name.
955         """
956         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
957             mobj = re.search(pattern, string, flags)
958         else:
959             for p in pattern:
960                 mobj = re.search(p, string, flags)
961                 if mobj:
962                     break
963
964         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
965             _name = '\033[0;34m%s\033[0m' % name
966         else:
967             _name = name
968
969         if mobj:
970             if group is None:
971                 # return the first matching group
972                 return next(g for g in mobj.groups() if g is not None)
973             else:
974                 return mobj.group(group)
975         elif default is not NO_DEFAULT:
976             return default
977         elif fatal:
978             raise RegexNotFoundError('Unable to extract %s' % _name)
979         else:
980             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
981             return None
982
983     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
984         """
985         Like _search_regex, but strips HTML tags and unescapes entities.
986         """
987         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
988         if res:
989             return clean_html(res).strip()
990         else:
991             return res
992
993     def _get_netrc_login_info(self, netrc_machine=None):
994         username = None
995         password = None
996         netrc_machine = netrc_machine or self._NETRC_MACHINE
997
998         if self._downloader.params.get('usenetrc', False):
999             try:
1000                 info = netrc.netrc().authenticators(netrc_machine)
1001                 if info is not None:
1002                     username = info[0]
1003                     password = info[2]
1004                 else:
1005                     raise netrc.NetrcParseError(
1006                         'No authenticators for %s' % netrc_machine)
1007             except (IOError, netrc.NetrcParseError) as err:
1008                 self._downloader.report_warning(
1009                     'parsing .netrc: %s' % error_to_compat_str(err))
1010
1011         return username, password
1012
1013     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1014         """
1015         Get the login info as (username, password)
1016         First look for the manually specified credentials using username_option
1017         and password_option as keys in params dictionary. If no such credentials
1018         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1019         value.
1020         If there's no info available, return (None, None)
1021         """
1022         if self._downloader is None:
1023             return (None, None)
1024
1025         downloader_params = self._downloader.params
1026
1027         # Attempt to use provided username and password or .netrc data
1028         if downloader_params.get(username_option) is not None:
1029             username = downloader_params[username_option]
1030             password = downloader_params[password_option]
1031         else:
1032             username, password = self._get_netrc_login_info(netrc_machine)
1033
1034         return username, password
1035
1036     def _get_tfa_info(self, note='two-factor verification code'):
1037         """
1038         Get the two-factor authentication info
1039         TODO - asking the user will be required for sms/phone verify
1040         currently just uses the command line option
1041         If there's no info available, return None
1042         """
1043         if self._downloader is None:
1044             return None
1045         downloader_params = self._downloader.params
1046
1047         if downloader_params.get('twofactor') is not None:
1048             return downloader_params['twofactor']
1049
1050         return compat_getpass('Type %s and press [Return]: ' % note)
1051
1052     # Helper functions for extracting OpenGraph info
1053     @staticmethod
1054     def _og_regexes(prop):
1055         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1056         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
1057                        % {'prop': re.escape(prop)})
1058         template = r'<meta[^>]+?%s[^>]+?%s'
1059         return [
1060             template % (property_re, content_re),
1061             template % (content_re, property_re),
1062         ]
1063
1064     @staticmethod
1065     def _meta_regex(prop):
1066         return r'''(?isx)<meta
1067                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1068                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1069
1070     def _og_search_property(self, prop, html, name=None, **kargs):
1071         if not isinstance(prop, (list, tuple)):
1072             prop = [prop]
1073         if name is None:
1074             name = 'OpenGraph %s' % prop[0]
1075         og_regexes = []
1076         for p in prop:
1077             og_regexes.extend(self._og_regexes(p))
1078         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1079         if escaped is None:
1080             return None
1081         return unescapeHTML(escaped)
1082
1083     def _og_search_thumbnail(self, html, **kargs):
1084         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1085
1086     def _og_search_description(self, html, **kargs):
1087         return self._og_search_property('description', html, fatal=False, **kargs)
1088
1089     def _og_search_title(self, html, **kargs):
1090         return self._og_search_property('title', html, **kargs)
1091
1092     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1093         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1094         if secure:
1095             regexes = self._og_regexes('video:secure_url') + regexes
1096         return self._html_search_regex(regexes, html, name, **kargs)
1097
1098     def _og_search_url(self, html, **kargs):
1099         return self._og_search_property('url', html, **kargs)
1100
1101     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1102         if not isinstance(name, (list, tuple)):
1103             name = [name]
1104         if display_name is None:
1105             display_name = name[0]
1106         return self._html_search_regex(
1107             [self._meta_regex(n) for n in name],
1108             html, display_name, fatal=fatal, group='content', **kwargs)
1109
1110     def _dc_search_uploader(self, html):
1111         return self._html_search_meta('dc.creator', html, 'uploader')
1112
1113     def _rta_search(self, html):
1114         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1115         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1116                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1117                      html):
1118             return 18
1119         return 0
1120
1121     def _media_rating_search(self, html):
1122         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1123         rating = self._html_search_meta('rating', html)
1124
1125         if not rating:
1126             return None
1127
1128         RATING_TABLE = {
1129             'safe for kids': 0,
1130             'general': 8,
1131             '14 years': 14,
1132             'mature': 17,
1133             'restricted': 19,
1134         }
1135         return RATING_TABLE.get(rating.lower())
1136
1137     def _family_friendly_search(self, html):
1138         # See http://schema.org/VideoObject
1139         family_friendly = self._html_search_meta(
1140             'isFamilyFriendly', html, default=None)
1141
1142         if not family_friendly:
1143             return None
1144
1145         RATING_TABLE = {
1146             '1': 0,
1147             'true': 0,
1148             '0': 18,
1149             'false': 18,
1150         }
1151         return RATING_TABLE.get(family_friendly.lower())
1152
1153     def _twitter_search_player(self, html):
1154         return self._html_search_meta('twitter:player', html,
1155                                       'twitter card player')
1156
1157     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1158         json_ld = self._search_regex(
1159             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1160         default = kwargs.get('default', NO_DEFAULT)
1161         if not json_ld:
1162             return default if default is not NO_DEFAULT else {}
1163         # JSON-LD may be malformed and thus `fatal` should be respected.
1164         # At the same time `default` may be passed that assumes `fatal=False`
1165         # for _search_regex. Let's simulate the same behavior here as well.
1166         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1167         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1168
1169     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1170         if isinstance(json_ld, compat_str):
1171             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1172         if not json_ld:
1173             return {}
1174         info = {}
1175         if not isinstance(json_ld, (list, tuple, dict)):
1176             return info
1177         if isinstance(json_ld, dict):
1178             json_ld = [json_ld]
1179
1180         INTERACTION_TYPE_MAP = {
1181             'CommentAction': 'comment',
1182             'AgreeAction': 'like',
1183             'DisagreeAction': 'dislike',
1184             'LikeAction': 'like',
1185             'DislikeAction': 'dislike',
1186             'ListenAction': 'view',
1187             'WatchAction': 'view',
1188             'ViewAction': 'view',
1189         }
1190
1191         def extract_interaction_statistic(e):
1192             interaction_statistic = e.get('interactionStatistic')
1193             if not isinstance(interaction_statistic, list):
1194                 return
1195             for is_e in interaction_statistic:
1196                 if not isinstance(is_e, dict):
1197                     continue
1198                 if is_e.get('@type') != 'InteractionCounter':
1199                     continue
1200                 interaction_type = is_e.get('interactionType')
1201                 if not isinstance(interaction_type, compat_str):
1202                     continue
1203                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1204                 if interaction_count is None:
1205                     continue
1206                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1207                 if not count_kind:
1208                     continue
1209                 count_key = '%s_count' % count_kind
1210                 if info.get(count_key) is not None:
1211                     continue
1212                 info[count_key] = interaction_count
1213
1214         def extract_video_object(e):
1215             assert e['@type'] == 'VideoObject'
1216             info.update({
1217                 'url': url_or_none(e.get('contentUrl')),
1218                 'title': unescapeHTML(e.get('name')),
1219                 'description': unescapeHTML(e.get('description')),
1220                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1221                 'duration': parse_duration(e.get('duration')),
1222                 'timestamp': unified_timestamp(e.get('uploadDate')),
1223                 'filesize': float_or_none(e.get('contentSize')),
1224                 'tbr': int_or_none(e.get('bitrate')),
1225                 'width': int_or_none(e.get('width')),
1226                 'height': int_or_none(e.get('height')),
1227                 'view_count': int_or_none(e.get('interactionCount')),
1228             })
1229             extract_interaction_statistic(e)
1230
1231         for e in json_ld:
1232             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1233                 item_type = e.get('@type')
1234                 if expected_type is not None and expected_type != item_type:
1235                     return info
1236                 if item_type in ('TVEpisode', 'Episode'):
1237                     info.update({
1238                         'episode': unescapeHTML(e.get('name')),
1239                         'episode_number': int_or_none(e.get('episodeNumber')),
1240                         'description': unescapeHTML(e.get('description')),
1241                     })
1242                     part_of_season = e.get('partOfSeason')
1243                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1244                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1245                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1246                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1247                         info['series'] = unescapeHTML(part_of_series.get('name'))
1248                 elif item_type in ('Article', 'NewsArticle'):
1249                     info.update({
1250                         'timestamp': parse_iso8601(e.get('datePublished')),
1251                         'title': unescapeHTML(e.get('headline')),
1252                         'description': unescapeHTML(e.get('articleBody')),
1253                     })
1254                 elif item_type == 'VideoObject':
1255                     extract_video_object(e)
1256                     continue
1257                 video = e.get('video')
1258                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1259                     extract_video_object(video)
1260                 break
1261         return dict((k, v) for k, v in info.items() if v is not None)
1262
1263     @staticmethod
1264     def _hidden_inputs(html):
1265         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1266         hidden_inputs = {}
1267         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1268             attrs = extract_attributes(input)
1269             if not input:
1270                 continue
1271             if attrs.get('type') not in ('hidden', 'submit'):
1272                 continue
1273             name = attrs.get('name') or attrs.get('id')
1274             value = attrs.get('value')
1275             if name and value is not None:
1276                 hidden_inputs[name] = value
1277         return hidden_inputs
1278
1279     def _form_hidden_inputs(self, form_id, html):
1280         form = self._search_regex(
1281             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1282             html, '%s form' % form_id, group='form')
1283         return self._hidden_inputs(form)
1284
1285     def _sort_formats(self, formats, field_preference=None):
1286         if not formats:
1287             raise ExtractorError('No video formats found')
1288
1289         for f in formats:
1290             # Automatically determine tbr when missing based on abr and vbr (improves
1291             # formats sorting in some cases)
1292             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1293                 f['tbr'] = f['abr'] + f['vbr']
1294
1295         def _formats_key(f):
1296             # TODO remove the following workaround
1297             from ..utils import determine_ext
1298             if not f.get('ext') and 'url' in f:
1299                 f['ext'] = determine_ext(f['url'])
1300
1301             if isinstance(field_preference, (list, tuple)):
1302                 return tuple(
1303                     f.get(field)
1304                     if f.get(field) is not None
1305                     else ('' if field == 'format_id' else -1)
1306                     for field in field_preference)
1307
1308             preference = f.get('preference')
1309             if preference is None:
1310                 preference = 0
1311                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1312                     preference -= 0.5
1313
1314             protocol = f.get('protocol') or determine_protocol(f)
1315             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1316
1317             if f.get('vcodec') == 'none':  # audio only
1318                 preference -= 50
1319                 if self._downloader.params.get('prefer_free_formats'):
1320                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1321                 else:
1322                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1323                 ext_preference = 0
1324                 try:
1325                     audio_ext_preference = ORDER.index(f['ext'])
1326                 except ValueError:
1327                     audio_ext_preference = -1
1328             else:
1329                 if f.get('acodec') == 'none':  # video only
1330                     preference -= 40
1331                 if self._downloader.params.get('prefer_free_formats'):
1332                     ORDER = ['flv', 'mp4', 'webm']
1333                 else:
1334                     ORDER = ['webm', 'flv', 'mp4']
1335                 try:
1336                     ext_preference = ORDER.index(f['ext'])
1337                 except ValueError:
1338                     ext_preference = -1
1339                 audio_ext_preference = 0
1340
1341             return (
1342                 preference,
1343                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1344                 f.get('quality') if f.get('quality') is not None else -1,
1345                 f.get('tbr') if f.get('tbr') is not None else -1,
1346                 f.get('filesize') if f.get('filesize') is not None else -1,
1347                 f.get('vbr') if f.get('vbr') is not None else -1,
1348                 f.get('height') if f.get('height') is not None else -1,
1349                 f.get('width') if f.get('width') is not None else -1,
1350                 proto_preference,
1351                 ext_preference,
1352                 f.get('abr') if f.get('abr') is not None else -1,
1353                 audio_ext_preference,
1354                 f.get('fps') if f.get('fps') is not None else -1,
1355                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1356                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1357                 f.get('format_id') if f.get('format_id') is not None else '',
1358             )
1359         formats.sort(key=_formats_key)
1360
1361     def _check_formats(self, formats, video_id):
1362         if formats:
1363             formats[:] = filter(
1364                 lambda f: self._is_valid_url(
1365                     f['url'], video_id,
1366                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1367                 formats)
1368
1369     @staticmethod
1370     def _remove_duplicate_formats(formats):
1371         format_urls = set()
1372         unique_formats = []
1373         for f in formats:
1374             if f['url'] not in format_urls:
1375                 format_urls.add(f['url'])
1376                 unique_formats.append(f)
1377         formats[:] = unique_formats
1378
1379     def _is_valid_url(self, url, video_id, item='video', headers={}):
1380         url = self._proto_relative_url(url, scheme='http:')
1381         # For now assume non HTTP(S) URLs always valid
1382         if not (url.startswith('http://') or url.startswith('https://')):
1383             return True
1384         try:
1385             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1386             return True
1387         except ExtractorError as e:
1388             if isinstance(e.cause, compat_urllib_error.URLError):
1389                 self.to_screen(
1390                     '%s: %s URL is invalid, skipping' % (video_id, item))
1391                 return False
1392             raise
1393
1394     def http_scheme(self):
1395         """ Either "http:" or "https:", depending on the user's preferences """
1396         return (
1397             'http:'
1398             if self._downloader.params.get('prefer_insecure', False)
1399             else 'https:')
1400
1401     def _proto_relative_url(self, url, scheme=None):
1402         if url is None:
1403             return url
1404         if url.startswith('//'):
1405             if scheme is None:
1406                 scheme = self.http_scheme()
1407             return scheme + url
1408         else:
1409             return url
1410
1411     def _sleep(self, timeout, video_id, msg_template=None):
1412         if msg_template is None:
1413             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1414         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1415         self.to_screen(msg)
1416         time.sleep(timeout)
1417
1418     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1419                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1420                              fatal=True, m3u8_id=None):
1421         manifest = self._download_xml(
1422             manifest_url, video_id, 'Downloading f4m manifest',
1423             'Unable to download f4m manifest',
1424             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1425             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1426             transform_source=transform_source,
1427             fatal=fatal)
1428
1429         if manifest is False:
1430             return []
1431
1432         return self._parse_f4m_formats(
1433             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1434             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1435
1436     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1437                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1438                            fatal=True, m3u8_id=None):
1439         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1440         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1441         if akamai_pv is not None and ';' in akamai_pv.text:
1442             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1443             if playerVerificationChallenge.strip() != '':
1444                 return []
1445
1446         formats = []
1447         manifest_version = '1.0'
1448         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1449         if not media_nodes:
1450             manifest_version = '2.0'
1451             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1452         # Remove unsupported DRM protected media from final formats
1453         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1454         media_nodes = remove_encrypted_media(media_nodes)
1455         if not media_nodes:
1456             return formats
1457
1458         manifest_base_url = get_base_url(manifest)
1459
1460         bootstrap_info = xpath_element(
1461             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1462             'bootstrap info', default=None)
1463
1464         vcodec = None
1465         mime_type = xpath_text(
1466             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1467             'base URL', default=None)
1468         if mime_type and mime_type.startswith('audio/'):
1469             vcodec = 'none'
1470
1471         for i, media_el in enumerate(media_nodes):
1472             tbr = int_or_none(media_el.attrib.get('bitrate'))
1473             width = int_or_none(media_el.attrib.get('width'))
1474             height = int_or_none(media_el.attrib.get('height'))
1475             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1476             # If <bootstrapInfo> is present, the specified f4m is a
1477             # stream-level manifest, and only set-level manifests may refer to
1478             # external resources.  See section 11.4 and section 4 of F4M spec
1479             if bootstrap_info is None:
1480                 media_url = None
1481                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1482                 if manifest_version == '2.0':
1483                     media_url = media_el.attrib.get('href')
1484                 if media_url is None:
1485                     media_url = media_el.attrib.get('url')
1486                 if not media_url:
1487                     continue
1488                 manifest_url = (
1489                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1490                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1491                 # If media_url is itself a f4m manifest do the recursive extraction
1492                 # since bitrates in parent manifest (this one) and media_url manifest
1493                 # may differ leading to inability to resolve the format by requested
1494                 # bitrate in f4m downloader
1495                 ext = determine_ext(manifest_url)
1496                 if ext == 'f4m':
1497                     f4m_formats = self._extract_f4m_formats(
1498                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1499                         transform_source=transform_source, fatal=fatal)
1500                     # Sometimes stream-level manifest contains single media entry that
1501                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1502                     # At the same time parent's media entry in set-level manifest may
1503                     # contain it. We will copy it from parent in such cases.
1504                     if len(f4m_formats) == 1:
1505                         f = f4m_formats[0]
1506                         f.update({
1507                             'tbr': f.get('tbr') or tbr,
1508                             'width': f.get('width') or width,
1509                             'height': f.get('height') or height,
1510                             'format_id': f.get('format_id') if not tbr else format_id,
1511                             'vcodec': vcodec,
1512                         })
1513                     formats.extend(f4m_formats)
1514                     continue
1515                 elif ext == 'm3u8':
1516                     formats.extend(self._extract_m3u8_formats(
1517                         manifest_url, video_id, 'mp4', preference=preference,
1518                         m3u8_id=m3u8_id, fatal=fatal))
1519                     continue
1520             formats.append({
1521                 'format_id': format_id,
1522                 'url': manifest_url,
1523                 'manifest_url': manifest_url,
1524                 'ext': 'flv' if bootstrap_info is not None else None,
1525                 'protocol': 'f4m',
1526                 'tbr': tbr,
1527                 'width': width,
1528                 'height': height,
1529                 'vcodec': vcodec,
1530                 'preference': preference,
1531             })
1532         return formats
1533
1534     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1535         return {
1536             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1537             'url': m3u8_url,
1538             'ext': ext,
1539             'protocol': 'm3u8',
1540             'preference': preference - 100 if preference else -100,
1541             'resolution': 'multiple',
1542             'format_note': 'Quality selection URL',
1543         }
1544
1545     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1546                               entry_protocol='m3u8', preference=None,
1547                               m3u8_id=None, note=None, errnote=None,
1548                               fatal=True, live=False):
1549         res = self._download_webpage_handle(
1550             m3u8_url, video_id,
1551             note=note or 'Downloading m3u8 information',
1552             errnote=errnote or 'Failed to download m3u8 information',
1553             fatal=fatal)
1554
1555         if res is False:
1556             return []
1557
1558         m3u8_doc, urlh = res
1559         m3u8_url = urlh.geturl()
1560
1561         return self._parse_m3u8_formats(
1562             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1563             preference=preference, m3u8_id=m3u8_id, live=live)
1564
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Build format dicts from the text of an m3u8 playlist.

        m3u8_doc is the playlist content and m3u8_url the URL it came from;
        URIs that do not start with http(s):// are joined onto m3u8_url.

        Returns an empty list for playlists carrying the Adobe Flash Access or
        Apple FairPlay markers checked below, a single format pointing to
        m3u8_url for media playlists, and otherwise one format per variant
        stream URI plus one per VIDEO/AUDIO EXT-X-MEDIA rendition that has a
        URI. When live is true the stream name / bitrate is not appended to
        the format id.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve a URI found in the playlist against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # Rendition groups collected from EXT-X-MEDIA tags, keyed by GROUP-ID
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset once the
        # URI line that follows it has been turned into a format
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is treated as the URI of a stream
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1723
1724     @staticmethod
1725     def _xpath_ns(path, namespace=None):
1726         if not namespace:
1727             return path
1728         out = []
1729         for c in path.split('/'):
1730             if not c or c == '.':
1731                 out.append(c)
1732             else:
1733                 out.append('{%s}%s' % (namespace, c))
1734         return '/'.join(out)
1735
1736     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1737         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1738
1739         if smil is False:
1740             assert not fatal
1741             return []
1742
1743         namespace = self._parse_smil_namespace(smil)
1744
1745         return self._parse_smil_formats(
1746             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1747
1748     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1749         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1750         if smil is False:
1751             return {}
1752         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1753
1754     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1755         return self._download_xml(
1756             smil_url, video_id, 'Downloading SMIL file',
1757             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1758
1759     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1760         namespace = self._parse_smil_namespace(smil)
1761
1762         formats = self._parse_smil_formats(
1763             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1764         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1765
1766         video_id = os.path.splitext(url_basename(smil_url))[0]
1767         title = None
1768         description = None
1769         upload_date = None
1770         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1771             name = meta.attrib.get('name')
1772             content = meta.attrib.get('content')
1773             if not name or not content:
1774                 continue
1775             if not title and name == 'title':
1776                 title = content
1777             elif not description and name in ('description', 'abstract'):
1778                 description = content
1779             elif not upload_date and name == 'date':
1780                 upload_date = unified_strdate(content)
1781
1782         thumbnails = [{
1783             'id': image.get('type'),
1784             'url': image.get('src'),
1785             'width': int_or_none(image.get('width')),
1786             'height': int_or_none(image.get('height')),
1787         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1788
1789         return {
1790             'id': video_id,
1791             'title': title or video_id,
1792             'description': description,
1793             'upload_date': upload_date,
1794             'thumbnails': thumbnails,
1795             'formats': formats,
1796             'subtitles': subtitles,
1797         }
1798
1799     def _parse_smil_namespace(self, smil):
1800         return self._search_regex(
1801             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1802
1803     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1804         base = smil_url
1805         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1806             b = meta.get('base') or meta.get('httpBase')
1807             if b:
1808                 base = b
1809                 break
1810
1811         formats = []
1812         rtmp_count = 0
1813         http_count = 0
1814         m3u8_count = 0
1815
1816         srcs = []
1817         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1818         for medium in media:
1819             src = medium.get('src')
1820             if not src or src in srcs:
1821                 continue
1822             srcs.append(src)
1823
1824             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1825             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1826             width = int_or_none(medium.get('width'))
1827             height = int_or_none(medium.get('height'))
1828             proto = medium.get('proto')
1829             ext = medium.get('ext')
1830             src_ext = determine_ext(src)
1831             streamer = medium.get('streamer') or base
1832
1833             if proto == 'rtmp' or streamer.startswith('rtmp'):
1834                 rtmp_count += 1
1835                 formats.append({
1836                     'url': streamer,
1837                     'play_path': src,
1838                     'ext': 'flv',
1839                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1840                     'tbr': bitrate,
1841                     'filesize': filesize,
1842                     'width': width,
1843                     'height': height,
1844                 })
1845                 if transform_rtmp_url:
1846                     streamer, src = transform_rtmp_url(streamer, src)
1847                     formats[-1].update({
1848                         'url': streamer,
1849                         'play_path': src,
1850                     })
1851                 continue
1852
1853             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1854             src_url = src_url.strip()
1855
1856             if proto == 'm3u8' or src_ext == 'm3u8':
1857                 m3u8_formats = self._extract_m3u8_formats(
1858                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1859                 if len(m3u8_formats) == 1:
1860                     m3u8_count += 1
1861                     m3u8_formats[0].update({
1862                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1863                         'tbr': bitrate,
1864                         'width': width,
1865                         'height': height,
1866                     })
1867                 formats.extend(m3u8_formats)
1868             elif src_ext == 'f4m':
1869                 f4m_url = src_url
1870                 if not f4m_params:
1871                     f4m_params = {
1872                         'hdcore': '3.2.0',
1873                         'plugin': 'flowplayer-3.2.0.1',
1874                     }
1875                 f4m_url += '&' if '?' in f4m_url else '?'
1876                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1877                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1878             elif src_ext == 'mpd':
1879                 formats.extend(self._extract_mpd_formats(
1880                     src_url, video_id, mpd_id='dash', fatal=False))
1881             elif re.search(r'\.ism/[Mm]anifest', src_url):
1882                 formats.extend(self._extract_ism_formats(
1883                     src_url, video_id, ism_id='mss', fatal=False))
1884             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1885                 http_count += 1
1886                 formats.append({
1887                     'url': src_url,
1888                     'ext': ext or src_ext or 'flv',
1889                     'format_id': 'http-%d' % (bitrate or http_count),
1890                     'tbr': bitrate,
1891                     'filesize': filesize,
1892                     'width': width,
1893                     'height': height,
1894                 })
1895
1896         return formats
1897
1898     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1899         urls = []
1900         subtitles = {}
1901         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1902             src = textstream.get('src')
1903             if not src or src in urls:
1904                 continue
1905             urls.append(src)
1906             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1907             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1908             subtitles.setdefault(lang, []).append({
1909                 'url': src,
1910                 'ext': ext,
1911             })
1912         return subtitles
1913
1914     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1915         xspf = self._download_xml(
1916             xspf_url, playlist_id, 'Downloading xpsf playlist',
1917             'Unable to download xspf manifest', fatal=fatal)
1918         if xspf is False:
1919             return []
1920         return self._parse_xspf(
1921             xspf, playlist_id, xspf_url=xspf_url,
1922             xspf_base_url=base_url(xspf_url))
1923
1924     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1925         NS_MAP = {
1926             'xspf': 'http://xspf.org/ns/0/',
1927             's1': 'http://static.streamone.nl/player/ns/0',
1928         }
1929
1930         entries = []
1931         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1932             title = xpath_text(
1933                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1934             description = xpath_text(
1935                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1936             thumbnail = xpath_text(
1937                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1938             duration = float_or_none(
1939                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1940
1941             formats = []
1942             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1943                 format_url = urljoin(xspf_base_url, location.text)
1944                 if not format_url:
1945                     continue
1946                 formats.append({
1947                     'url': format_url,
1948                     'manifest_url': xspf_url,
1949                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1950                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1951                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1952                 })
1953             self._sort_formats(formats)
1954
1955             entries.append({
1956                 'id': playlist_id,
1957                 'title': title,
1958                 'description': description,
1959                 'thumbnail': thumbnail,
1960                 'duration': duration,
1961                 'formats': formats,
1962             })
1963         return entries
1964
1965     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1966         res = self._download_xml_handle(
1967             mpd_url, video_id,
1968             note=note or 'Downloading MPD manifest',
1969             errnote=errnote or 'Failed to download MPD manifest',
1970             fatal=fatal)
1971         if res is False:
1972             return []
1973         mpd_doc, urlh = res
1974         mpd_base_url = base_url(urlh.geturl())
1975
1976         return self._parse_mpd_formats(
1977             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1978             formats_dict=formats_dict, mpd_url=mpd_url)
1979
1980     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1981         """
1982         Parse formats from MPD manifest.
1983         References:
1984          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1985             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1986          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1987         """
1988         if mpd_doc.get('type') == 'dynamic':
1989             return []
1990
1991         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1992
1993         def _add_ns(path):
1994             return self._xpath_ns(path, namespace)
1995
1996         def is_drm_protected(element):
1997             return element.find(_add_ns('ContentProtection')) is not None
1998
1999         def extract_multisegment_info(element, ms_parent_info):
2000             ms_info = ms_parent_info.copy()
2001
2002             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2003             # common attributes and elements.  We will only extract relevant
2004             # for us.
2005             def extract_common(source):
2006                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2007                 if segment_timeline is not None:
2008                     s_e = segment_timeline.findall(_add_ns('S'))
2009                     if s_e:
2010                         ms_info['total_number'] = 0
2011                         ms_info['s'] = []
2012                         for s in s_e:
2013                             r = int(s.get('r', 0))
2014                             ms_info['total_number'] += 1 + r
2015                             ms_info['s'].append({
2016                                 't': int(s.get('t', 0)),
2017                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2018                                 'd': int(s.attrib['d']),
2019                                 'r': r,
2020                             })
2021                 start_number = source.get('startNumber')
2022                 if start_number:
2023                     ms_info['start_number'] = int(start_number)
2024                 timescale = source.get('timescale')
2025                 if timescale:
2026                     ms_info['timescale'] = int(timescale)
2027                 segment_duration = source.get('duration')
2028                 if segment_duration:
2029                     ms_info['segment_duration'] = float(segment_duration)
2030
2031             def extract_Initialization(source):
2032                 initialization = source.find(_add_ns('Initialization'))
2033                 if initialization is not None:
2034                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2035
2036             segment_list = element.find(_add_ns('SegmentList'))
2037             if segment_list is not None:
2038                 extract_common(segment_list)
2039                 extract_Initialization(segment_list)
2040                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2041                 if segment_urls_e:
2042                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2043             else:
2044                 segment_template = element.find(_add_ns('SegmentTemplate'))
2045                 if segment_template is not None:
2046                     extract_common(segment_template)
2047                     media = segment_template.get('media')
2048                     if media:
2049                         ms_info['media'] = media
2050                     initialization = segment_template.get('initialization')
2051                     if initialization:
2052                         ms_info['initialization'] = initialization
2053                     else:
2054                         extract_Initialization(segment_template)
2055             return ms_info
2056
2057         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2058         formats = []
2059         for period in mpd_doc.findall(_add_ns('Period')):
2060             period_duration = parse_duration(period.get('duration')) or mpd_duration
2061             period_ms_info = extract_multisegment_info(period, {
2062                 'start_number': 1,
2063                 'timescale': 1,
2064             })
2065             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2066                 if is_drm_protected(adaptation_set):
2067                     continue
2068                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2069                 for representation in adaptation_set.findall(_add_ns('Representation')):
2070                     if is_drm_protected(representation):
2071                         continue
2072                     representation_attrib = adaptation_set.attrib.copy()
2073                     representation_attrib.update(representation.attrib)
2074                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2075                     mime_type = representation_attrib['mimeType']
2076                     content_type = mime_type.split('/')[0]
2077                     if content_type == 'text':
2078                         # TODO implement WebVTT downloading
2079                         pass
2080                     elif content_type in ('video', 'audio'):
2081                         base_url = ''
2082                         for element in (representation, adaptation_set, period, mpd_doc):
2083                             base_url_e = element.find(_add_ns('BaseURL'))
2084                             if base_url_e is not None:
2085                                 base_url = base_url_e.text + base_url
2086                                 if re.match(r'^https?://', base_url):
2087                                     break
2088                         if mpd_base_url and not re.match(r'^https?://', base_url):
2089                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2090                                 mpd_base_url += '/'
2091                             base_url = mpd_base_url + base_url
2092                         representation_id = representation_attrib.get('id')
2093                         lang = representation_attrib.get('lang')
2094                         url_el = representation.find(_add_ns('BaseURL'))
2095                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2096                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2097                         f = {
2098                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2099                             'url': base_url,
2100                             'manifest_url': mpd_url,
2101                             'ext': mimetype2ext(mime_type),
2102                             'width': int_or_none(representation_attrib.get('width')),
2103                             'height': int_or_none(representation_attrib.get('height')),
2104                             'tbr': float_or_none(bandwidth, 1000),
2105                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2106                             'fps': int_or_none(representation_attrib.get('frameRate')),
2107                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2108                             'format_note': 'DASH %s' % content_type,
2109                             'filesize': filesize,
2110                             'container': mimetype2ext(mime_type) + '_dash',
2111                         }
2112                         f.update(parse_codecs(representation_attrib.get('codecs')))
2113                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2114
2115                         def prepare_template(template_name, identifiers):
2116                             tmpl = representation_ms_info[template_name]
2117                             # First of, % characters outside $...$ templates
2118                             # must be escaped by doubling for proper processing
2119                             # by % operator string formatting used further (see
2120                             # https://github.com/rg3/youtube-dl/issues/16867).
2121                             t = ''
2122                             in_template = False
2123                             for c in tmpl:
2124                                 t += c
2125                                 if c == '$':
2126                                     in_template = not in_template
2127                                 elif c == '%' and not in_template:
2128                                     t += c
2129                             # Next, $...$ templates are translated to their
2130                             # %(...) counterparts to be used with % operator
2131                             t = t.replace('$RepresentationID$', representation_id)
2132                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2133                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2134                             t.replace('$$', '$')
2135                             return t
2136
2137                         # @initialization is a regular template like @media one
2138                         # so it should be handled just the same way (see
2139                         # https://github.com/rg3/youtube-dl/issues/11605)
2140                         if 'initialization' in representation_ms_info:
2141                             initialization_template = prepare_template(
2142                                 'initialization',
2143                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2144                                 # $Time$ shall not be included for @initialization thus
2145                                 # only $Bandwidth$ remains
2146                                 ('Bandwidth', ))
2147                             representation_ms_info['initialization_url'] = initialization_template % {
2148                                 'Bandwidth': bandwidth,
2149                             }
2150
2151                         def location_key(location):
2152                             return 'url' if re.match(r'^https?://', location) else 'path'
2153
2154                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2155
2156                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2157                             media_location_key = location_key(media_template)
2158
2159                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2160                             # can't be used at the same time
2161                             if '%(Number' in media_template and 's' not in representation_ms_info:
2162                                 segment_duration = None
2163                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2164                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2165                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2166                                 representation_ms_info['fragments'] = [{
2167                                     media_location_key: media_template % {
2168                                         'Number': segment_number,
2169                                         'Bandwidth': bandwidth,
2170                                     },
2171                                     'duration': segment_duration,
2172                                 } for segment_number in range(
2173                                     representation_ms_info['start_number'],
2174                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2175                             else:
2176                                 # $Number*$ or $Time$ in media template with S list available
2177                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2178                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2179                                 representation_ms_info['fragments'] = []
2180                                 segment_time = 0
2181                                 segment_d = None
2182                                 segment_number = representation_ms_info['start_number']
2183
2184                                 def add_segment_url():
2185                                     segment_url = media_template % {
2186                                         'Time': segment_time,
2187                                         'Bandwidth': bandwidth,
2188                                         'Number': segment_number,
2189                                     }
2190                                     representation_ms_info['fragments'].append({
2191                                         media_location_key: segment_url,
2192                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2193                                     })
2194
2195                                 for num, s in enumerate(representation_ms_info['s']):
2196                                     segment_time = s.get('t') or segment_time
2197                                     segment_d = s['d']
2198                                     add_segment_url()
2199                                     segment_number += 1
2200                                     for r in range(s.get('r', 0)):
2201                                         segment_time += segment_d
2202                                         add_segment_url()
2203                                         segment_number += 1
2204                                     segment_time += segment_d
2205                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2206                             # No media template
2207                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2208                             # or any YouTube dashsegments video
2209                             fragments = []
2210                             segment_index = 0
2211                             timescale = representation_ms_info['timescale']
2212                             for s in representation_ms_info['s']:
2213                                 duration = float_or_none(s['d'], timescale)
2214                                 for r in range(s.get('r', 0) + 1):
2215                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2216                                     fragments.append({
2217                                         location_key(segment_uri): segment_uri,
2218                                         'duration': duration,
2219                                     })
2220                                     segment_index += 1
2221                             representation_ms_info['fragments'] = fragments
2222                         elif 'segment_urls' in representation_ms_info:
2223                             # Segment URLs with no SegmentTimeline
2224                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2225                             # https://github.com/rg3/youtube-dl/pull/14844
2226                             fragments = []
2227                             segment_duration = float_or_none(
2228                                 representation_ms_info['segment_duration'],
2229                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2230                             for segment_url in representation_ms_info['segment_urls']:
2231                                 fragment = {
2232                                     location_key(segment_url): segment_url,
2233                                 }
2234                                 if segment_duration:
2235                                     fragment['duration'] = segment_duration
2236                                 fragments.append(fragment)
2237                             representation_ms_info['fragments'] = fragments
2238                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2239                         # No fragments key is present in this case.
2240                         if 'fragments' in representation_ms_info:
2241                             f.update({
2242                                 'fragment_base_url': base_url,
2243                                 'fragments': [],
2244                                 'protocol': 'http_dash_segments',
2245                             })
2246                             if 'initialization_url' in representation_ms_info:
2247                                 initialization_url = representation_ms_info['initialization_url']
2248                                 if not f.get('url'):
2249                                     f['url'] = initialization_url
2250                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2251                             f['fragments'].extend(representation_ms_info['fragments'])
2252                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2253                         # is not necessarily unique within a Period thus formats with
2254                         # the same `format_id` are quite possible. There are numerous examples
2255                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2256                         # https://github.com/rg3/youtube-dl/issues/13919)
2257                         full_info = formats_dict.get(representation_id, {}).copy()
2258                         full_info.update(f)
2259                         formats.append(full_info)
2260                     else:
2261                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2262         return formats
2263
2264     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2265         res = self._download_xml_handle(
2266             ism_url, video_id,
2267             note=note or 'Downloading ISM manifest',
2268             errnote=errnote or 'Failed to download ISM manifest',
2269             fatal=fatal)
2270         if res is False:
2271             return []
2272         ism_doc, urlh = res
2273
2274         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2275
2276     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2277         """
2278         Parse formats from ISM manifest.
2279         References:
2280          1. [MS-SSTR]: Smooth Streaming Protocol,
2281             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2282         """
2283         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2284             return []
2285
2286         duration = int(ism_doc.attrib['Duration'])
2287         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2288
2289         formats = []
2290         for stream in ism_doc.findall('StreamIndex'):
2291             stream_type = stream.get('Type')
2292             if stream_type not in ('video', 'audio'):
2293                 continue
2294             url_pattern = stream.attrib['Url']
2295             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2296             stream_name = stream.get('Name')
2297             for track in stream.findall('QualityLevel'):
2298                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2299                 # TODO: add support for WVC1 and WMAP
2300                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2301                     self.report_warning('%s is not a supported codec' % fourcc)
2302                     continue
2303                 tbr = int(track.attrib['Bitrate']) // 1000
2304                 # [1] does not mention Width and Height attributes. However,
2305                 # they're often present while MaxWidth and MaxHeight are
2306                 # missing, so should be used as fallbacks
2307                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2308                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2309                 sampling_rate = int_or_none(track.get('SamplingRate'))
2310
2311                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2312                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2313
2314                 fragments = []
2315                 fragment_ctx = {
2316                     'time': 0,
2317                 }
2318                 stream_fragments = stream.findall('c')
2319                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2320                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2321                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2322                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2323                     if not fragment_ctx['duration']:
2324                         try:
2325                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2326                         except IndexError:
2327                             next_fragment_time = duration
2328                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2329                     for _ in range(fragment_repeat):
2330                         fragments.append({
2331                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2332                             'duration': fragment_ctx['duration'] / stream_timescale,
2333                         })
2334                         fragment_ctx['time'] += fragment_ctx['duration']
2335
2336                 format_id = []
2337                 if ism_id:
2338                     format_id.append(ism_id)
2339                 if stream_name:
2340                     format_id.append(stream_name)
2341                 format_id.append(compat_str(tbr))
2342
2343                 formats.append({
2344                     'format_id': '-'.join(format_id),
2345                     'url': ism_url,
2346                     'manifest_url': ism_url,
2347                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2348                     'width': width,
2349                     'height': height,
2350                     'tbr': tbr,
2351                     'asr': sampling_rate,
2352                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2353                     'acodec': 'none' if stream_type == 'video' else fourcc,
2354                     'protocol': 'ism',
2355                     'fragments': fragments,
2356                     '_download_params': {
2357                         'duration': duration,
2358                         'timescale': stream_timescale,
2359                         'width': width or 0,
2360                         'height': height or 0,
2361                         'fourcc': fourcc,
2362                         'codec_private_data': track.get('CodecPrivateData'),
2363                         'sampling_rate': sampling_rate,
2364                         'channels': int_or_none(track.get('Channels', 2)),
2365                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2366                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2367                     },
2368                 })
2369         return formats
2370
2371     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2372         def absolute_url(item_url):
2373             return urljoin(base_url, item_url)
2374
2375         def parse_content_type(content_type):
2376             if not content_type:
2377                 return {}
2378             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2379             if ctr:
2380                 mimetype, codecs = ctr.groups()
2381                 f = parse_codecs(codecs)
2382                 f['ext'] = mimetype2ext(mimetype)
2383                 return f
2384             return {}
2385
2386         def _media_formats(src, cur_media_type, type_info={}):
2387             full_url = absolute_url(src)
2388             ext = type_info.get('ext') or determine_ext(full_url)
2389             if ext == 'm3u8':
2390                 is_plain_url = False
2391                 formats = self._extract_m3u8_formats(
2392                     full_url, video_id, ext='mp4',
2393                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2394                     preference=preference, fatal=False)
2395             elif ext == 'mpd':
2396                 is_plain_url = False
2397                 formats = self._extract_mpd_formats(
2398                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2399             else:
2400                 is_plain_url = True
2401                 formats = [{
2402                     'url': full_url,
2403                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2404                 }]
2405             return is_plain_url, formats
2406
2407         entries = []
2408         # amp-video and amp-audio are very similar to their HTML5 counterparts
2409         # so we wll include them right here (see
2410         # https://www.ampproject.org/docs/reference/components/amp-video)
2411         media_tags = [(media_tag, media_type, '')
2412                       for media_tag, media_type
2413                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2414         media_tags.extend(re.findall(
2415             # We only allow video|audio followed by a whitespace or '>'.
2416             # Allowing more characters may end up in significant slow down (see
2417             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2418             # http://www.porntrex.com/maps/videositemap.xml).
2419             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2420         for media_tag, media_type, media_content in media_tags:
2421             media_info = {
2422                 'formats': [],
2423                 'subtitles': {},
2424             }
2425             media_attributes = extract_attributes(media_tag)
2426             src = media_attributes.get('src')
2427             if src:
2428                 _, formats = _media_formats(src, media_type)
2429                 media_info['formats'].extend(formats)
2430             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2431             if media_content:
2432                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2433                     source_attributes = extract_attributes(source_tag)
2434                     src = source_attributes.get('src')
2435                     if not src:
2436                         continue
2437                     f = parse_content_type(source_attributes.get('type'))
2438                     is_plain_url, formats = _media_formats(src, media_type, f)
2439                     if is_plain_url:
2440                         # res attribute is not standard but seen several times
2441                         # in the wild
2442                         f.update({
2443                             'height': int_or_none(source_attributes.get('res')),
2444                             'format_id': source_attributes.get('label'),
2445                         })
2446                         f.update(formats[0])
2447                         media_info['formats'].append(f)
2448                     else:
2449                         media_info['formats'].extend(formats)
2450                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2451                     track_attributes = extract_attributes(track_tag)
2452                     kind = track_attributes.get('kind')
2453                     if not kind or kind in ('subtitles', 'captions'):
2454                         src = track_attributes.get('src')
2455                         if not src:
2456                             continue
2457                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2458                         media_info['subtitles'].setdefault(lang, []).append({
2459                             'url': absolute_url(src),
2460                         })
2461             for f in media_info['formats']:
2462                 f.setdefault('http_headers', {})['Referer'] = base_url
2463             if media_info['formats'] or media_info['subtitles']:
2464                 entries.append(media_info)
2465         return entries
2466
2467     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2468         formats = []
2469         hdcore_sign = 'hdcore=3.7.0'
2470         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2471         hds_host = hosts.get('hds')
2472         if hds_host:
2473             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2474         if 'hdcore=' not in f4m_url:
2475             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2476         f4m_formats = self._extract_f4m_formats(
2477             f4m_url, video_id, f4m_id='hds', fatal=False)
2478         for entry in f4m_formats:
2479             entry.update({'extra_param_to_segment_url': hdcore_sign})
2480         formats.extend(f4m_formats)
2481         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2482         hls_host = hosts.get('hls')
2483         if hls_host:
2484             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2485         formats.extend(self._extract_m3u8_formats(
2486             m3u8_url, video_id, 'mp4', 'm3u8_native',
2487             m3u8_id='hls', fatal=False))
2488         return formats
2489
2490     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2491         query = compat_urlparse.urlparse(url).query
2492         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2493         mobj = re.search(
2494             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2495         url_base = mobj.group('url')
2496         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2497         formats = []
2498
2499         def manifest_url(manifest):
2500             m_url = '%s/%s' % (http_base_url, manifest)
2501             if query:
2502                 m_url += '?%s' % query
2503             return m_url
2504
2505         if 'm3u8' not in skip_protocols:
2506             formats.extend(self._extract_m3u8_formats(
2507                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2508                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2509         if 'f4m' not in skip_protocols:
2510             formats.extend(self._extract_f4m_formats(
2511                 manifest_url('manifest.f4m'),
2512                 video_id, f4m_id='hds', fatal=False))
2513         if 'dash' not in skip_protocols:
2514             formats.extend(self._extract_mpd_formats(
2515                 manifest_url('manifest.mpd'),
2516                 video_id, mpd_id='dash', fatal=False))
2517         if re.search(r'(?:/smil:|\.smil)', url_base):
2518             if 'smil' not in skip_protocols:
2519                 rtmp_formats = self._extract_smil_formats(
2520                     manifest_url('jwplayer.smil'),
2521                     video_id, fatal=False)
2522                 for rtmp_format in rtmp_formats:
2523                     rtsp_format = rtmp_format.copy()
2524                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2525                     del rtsp_format['play_path']
2526                     del rtsp_format['ext']
2527                     rtsp_format.update({
2528                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2529                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2530                         'protocol': 'rtsp',
2531                     })
2532                     formats.extend([rtmp_format, rtsp_format])
2533         else:
2534             for protocol in ('rtmp', 'rtsp'):
2535                 if protocol not in skip_protocols:
2536                     formats.append({
2537                         'url': '%s:%s' % (protocol, url_base),
2538                         'format_id': protocol,
2539                         'protocol': protocol,
2540                     })
2541         return formats
2542
2543     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2544         mobj = re.search(
2545             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2546             webpage)
2547         if mobj:
2548             try:
2549                 jwplayer_data = self._parse_json(mobj.group('options'),
2550                                                  video_id=video_id,
2551                                                  transform_source=transform_source)
2552             except ExtractorError:
2553                 pass
2554             else:
2555                 if isinstance(jwplayer_data, dict):
2556                     return jwplayer_data
2557
2558     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2559         jwplayer_data = self._find_jwplayer_data(
2560             webpage, video_id, transform_source=js_to_json)
2561         return self._parse_jwplayer_data(
2562             jwplayer_data, video_id, *args, **kwargs)
2563
2564     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2565                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2566         # JWPlayer backward compatibility: flattened playlists
2567         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2568         if 'playlist' not in jwplayer_data:
2569             jwplayer_data = {'playlist': [jwplayer_data]}
2570
2571         entries = []
2572
2573         # JWPlayer backward compatibility: single playlist item
2574         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2575         if not isinstance(jwplayer_data['playlist'], list):
2576             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2577
2578         for video_data in jwplayer_data['playlist']:
2579             # JWPlayer backward compatibility: flattened sources
2580             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2581             if 'sources' not in video_data:
2582                 video_data['sources'] = [video_data]
2583
2584             this_video_id = video_id or video_data['mediaid']
2585
2586             formats = self._parse_jwplayer_formats(
2587                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2588                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2589
2590             subtitles = {}
2591             tracks = video_data.get('tracks')
2592             if tracks and isinstance(tracks, list):
2593                 for track in tracks:
2594                     if not isinstance(track, dict):
2595                         continue
2596                     track_kind = track.get('kind')
2597                     if not track_kind or not isinstance(track_kind, compat_str):
2598                         continue
2599                     if track_kind.lower() not in ('captions', 'subtitles'):
2600                         continue
2601                     track_url = urljoin(base_url, track.get('file'))
2602                     if not track_url:
2603                         continue
2604                     subtitles.setdefault(track.get('label') or 'en', []).append({
2605                         'url': self._proto_relative_url(track_url)
2606                     })
2607
2608             entry = {
2609                 'id': this_video_id,
2610                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2611                 'description': video_data.get('description'),
2612                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2613                 'timestamp': int_or_none(video_data.get('pubdate')),
2614                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2615                 'subtitles': subtitles,
2616             }
2617             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2618             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2619                 entry.update({
2620                     '_type': 'url_transparent',
2621                     'url': formats[0]['url'],
2622                 })
2623             else:
2624                 self._sort_formats(formats)
2625                 entry['formats'] = formats
2626             entries.append(entry)
2627         if len(entries) == 1:
2628             return entries[0]
2629         else:
2630             return self.playlist_result(entries)
2631
2632     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2633                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2634         urls = []
2635         formats = []
2636         for source in jwplayer_sources_data:
2637             if not isinstance(source, dict):
2638                 continue
2639             source_url = self._proto_relative_url(source.get('file'))
2640             if not source_url:
2641                 continue
2642             if base_url:
2643                 source_url = compat_urlparse.urljoin(base_url, source_url)
2644             if source_url in urls:
2645                 continue
2646             urls.append(source_url)
2647             source_type = source.get('type') or ''
2648             ext = mimetype2ext(source_type) or determine_ext(source_url)
2649             if source_type == 'hls' or ext == 'm3u8':
2650                 formats.extend(self._extract_m3u8_formats(
2651                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2652                     m3u8_id=m3u8_id, fatal=False))
2653             elif source_type == 'dash' or ext == 'mpd':
2654                 formats.extend(self._extract_mpd_formats(
2655                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2656             elif ext == 'smil':
2657                 formats.extend(self._extract_smil_formats(
2658                     source_url, video_id, fatal=False))
2659             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2660             elif source_type.startswith('audio') or ext in (
2661                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2662                 formats.append({
2663                     'url': source_url,
2664                     'vcodec': 'none',
2665                     'ext': ext,
2666                 })
2667             else:
2668                 height = int_or_none(source.get('height'))
2669                 if height is None:
2670                     # Often no height is provided but there is a label in
2671                     # format like "1080p", "720p SD", or 1080.
2672                     height = int_or_none(self._search_regex(
2673                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2674                         'height', default=None))
2675                 a_format = {
2676                     'url': source_url,
2677                     'width': int_or_none(source.get('width')),
2678                     'height': height,
2679                     'tbr': int_or_none(source.get('bitrate')),
2680                     'ext': ext,
2681                 }
2682                 if source_url.startswith('rtmp'):
2683                     a_format['ext'] = 'flv'
2684                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2685                     # of jwplayer.flash.swf
2686                     rtmp_url_parts = re.split(
2687                         r'((?:mp4|mp3|flv):)', source_url, 1)
2688                     if len(rtmp_url_parts) == 3:
2689                         rtmp_url, prefix, play_path = rtmp_url_parts
2690                         a_format.update({
2691                             'url': rtmp_url,
2692                             'play_path': prefix + play_path,
2693                         })
2694                     if rtmp_params:
2695                         a_format.update(rtmp_params)
2696                 formats.append(a_format)
2697         return formats
2698
2699     def _live_title(self, name):
2700         """ Generate the title for a live video """
2701         now = datetime.datetime.now()
2702         now_str = now.strftime('%Y-%m-%d %H:%M')
2703         return name + ' ' + now_str
2704
2705     def _int(self, v, name, fatal=False, **kwargs):
2706         res = int_or_none(v, **kwargs)
2707         if 'get_attr' in kwargs:
2708             print(getattr(v, kwargs['get_attr']))
2709         if res is None:
2710             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2711             if fatal:
2712                 raise ExtractorError(msg)
2713             else:
2714                 self._downloader.report_warning(msg)
2715         return res
2716
2717     def _float(self, v, name, fatal=False, **kwargs):
2718         res = float_or_none(v, **kwargs)
2719         if res is None:
2720             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2721             if fatal:
2722                 raise ExtractorError(msg)
2723             else:
2724                 self._downloader.report_warning(msg)
2725         return res
2726
2727     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2728                     path='/', secure=False, discard=False, rest={}, **kwargs):
2729         cookie = compat_cookiejar.Cookie(
2730             0, name, value, port, port is not None, domain, True,
2731             domain.startswith('.'), path, True, secure, expire_time,
2732             discard, None, None, rest)
2733         self._downloader.cookiejar.set_cookie(cookie)
2734
2735     def _get_cookies(self, url):
2736         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2737         req = sanitized_Request(url)
2738         self._downloader.cookiejar.add_cookie_header(req)
2739         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2740
2741     def get_testcases(self, include_onlymatching=False):
2742         t = getattr(self, '_TEST', None)
2743         if t:
2744             assert not hasattr(self, '_TESTS'), \
2745                 '%s has _TEST and _TESTS' % type(self).__name__
2746             tests = [t]
2747         else:
2748             tests = getattr(self, '_TESTS', [])
2749         for t in tests:
2750             if not include_onlymatching and t.get('only_matching', False):
2751                 continue
2752             t['name'] = type(self).__name__[:-len('IE')]
2753             yield t
2754
2755     def is_suitable(self, age_limit):
2756         """ Test whether the extractor is generally suitable for the given
2757         age limit (i.e. pornographic sites are not, all others usually are) """
2758
2759         any_restricted = False
2760         for tc in self.get_testcases(include_onlymatching=False):
2761             if tc.get('playlist', []):
2762                 tc = tc['playlist'][0]
2763             is_restricted = age_restricted(
2764                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2765             if not is_restricted:
2766                 return True
2767             any_restricted = any_restricted or is_restricted
2768         return not any_restricted
2769
2770     def extract_subtitles(self, *args, **kwargs):
2771         if (self._downloader.params.get('writesubtitles', False) or
2772                 self._downloader.params.get('listsubtitles')):
2773             return self._get_subtitles(*args, **kwargs)
2774         return {}
2775
2776     def _get_subtitles(self, *args, **kwargs):
2777         raise NotImplementedError('This method must be implemented by subclasses')
2778
2779     @staticmethod
2780     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2781         """ Merge subtitle items for one language. Items with duplicated URLs
2782         will be dropped. """
2783         list1_urls = set([item['url'] for item in subtitle_list1])
2784         ret = list(subtitle_list1)
2785         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2786         return ret
2787
2788     @classmethod
2789     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2790         """ Merge two subtitle dictionaries, language by language. """
2791         ret = dict(subtitle_dict1)
2792         for lang in subtitle_dict2:
2793             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2794         return ret
2795
2796     def extract_automatic_captions(self, *args, **kwargs):
2797         if (self._downloader.params.get('writeautomaticsub', False) or
2798                 self._downloader.params.get('listsubtitles')):
2799             return self._get_automatic_captions(*args, **kwargs)
2800         return {}
2801
2802     def _get_automatic_captions(self, *args, **kwargs):
2803         raise NotImplementedError('This method must be implemented by subclasses')
2804
2805     def mark_watched(self, *args, **kwargs):
2806         if (self._downloader.params.get('mark_watched', False) and
2807                 (self._get_login_info()[0] is not None or
2808                     self._downloader.params.get('cookiefile') is not None)):
2809             self._mark_watched(*args, **kwargs)
2810
2811     def _mark_watched(self, *args, **kwargs):
2812         raise NotImplementedError('This method must be implemented by subclasses')
2813
2814     def geo_verification_headers(self):
2815         headers = {}
2816         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2817         if geo_verification_proxy:
2818             headers['Ytdl-request-proxy'] = geo_verification_proxy
2819         return headers
2820
2821     def _generic_id(self, url):
2822         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2823
2824     def _generic_title(self, url):
2825         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2826
2827
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source that search URLs of this class must match."""
        url_template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return url_template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url matches this extractor's search URL pattern."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS, and
        a number requests that many results, capped at _MAX_RESULTS with a
        warning.  Raises ExtractorError for a query that does not match the
        pattern or for a non-positive number.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY