[extractor/common] Clarify url and manifest_url meta fields
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_integer_types,
23     compat_http_client,
24     compat_os_name,
25     compat_str,
26     compat_urllib_error,
27     compat_urllib_parse_unquote,
28     compat_urllib_parse_urlencode,
29     compat_urllib_request,
30     compat_urlparse,
31     compat_xml_parse_error,
32 )
33 from ..downloader.f4m import (
34     get_base_url,
35     remove_encrypted_media,
36 )
37 from ..utils import (
38     NO_DEFAULT,
39     age_restricted,
40     base_url,
41     bug_reports_message,
42     clean_html,
43     compiled_regex_type,
44     determine_ext,
45     determine_protocol,
46     error_to_compat_str,
47     ExtractorError,
48     extract_attributes,
49     fix_xml_ampersands,
50     float_or_none,
51     GeoRestrictedError,
52     GeoUtils,
53     int_or_none,
54     js_to_json,
55     JSON_LD_RE,
56     mimetype2ext,
57     orderedSet,
58     parse_codecs,
59     parse_duration,
60     parse_iso8601,
61     parse_m3u8_attributes,
62     RegexNotFoundError,
63     sanitized_Request,
64     sanitize_filename,
65     unescapeHTML,
66     unified_strdate,
67     unified_timestamp,
68     update_Request,
69     update_url_query,
70     urljoin,
71     url_basename,
72     url_or_none,
73     xpath_element,
74     xpath_text,
75     xpath_with_ns,
76 )
77
78
79 class InfoExtractor(object):
80     """Information Extractor class.
81
82     Information extractors are the classes that, given a URL, extract
83     information about the video (or videos) the URL refers to. This
84     information includes the real video URL, the video title, author and
85     others. The information is stored in a dictionary which is then
86     passed to the YoutubeDL. The YoutubeDL processes this
87     information possibly downloading the video to the file system, among
88     other possible outcomes.
89
90     The type field determines the type of the result.
91     By far the most common value (and the default if _type is missing) is
92     "video", which indicates a single video.
93
94     For a video, the dictionaries must include the following fields:
95
96     id:             Video identifier.
97     title:          Video title, unescaped.
98
99     Additionally, it must contain either a formats entry or a url one:
100
101     formats:        A list of dictionaries for each format available, ordered
102                     from worst to best quality.
103
104                     Potential fields:
105                     * url        The mandatory URL representing the media:
106                                    for plain file media - HTTP URL of this file,
107                                    for RTMP - RTMP URL,
108                                    for HLS - URL of the M3U8 media playlist,
109                                    for HDS - URL of the F4M manifest,
110                                    for DASH - URL of the MPD manifest,
111                                    for MSS - URL of the ISM manifest.
112                     * manifest_url
113                                  The URL of the manifest file in case of
114                                  fragmented media:
115                                    for HLS - URL of the M3U8 master playlist,
116                                    for HDS - URL of the F4M manifest,
117                                    for DASH - URL of the MPD manifest,
118                                    for MSS - URL of the ISM manifest.
119                     * ext        Will be calculated from URL if missing
120                     * format     A human-readable description of the format
121                                  ("mp4 container with h264/opus").
122                                  Calculated from the format_id, width, height.
123                                  and format_note fields if missing.
124                     * format_id  A short description of the format
125                                  ("mp4_h264_opus" or "19").
126                                 Technically optional, but strongly recommended.
127                     * format_note Additional info about the format
128                                  ("3D" or "DASH video")
129                     * width      Width of the video, if known
130                     * height     Height of the video, if known
131                     * resolution Textual description of width and height
132                     * tbr        Average bitrate of audio and video in KBit/s
133                     * abr        Average audio bitrate in KBit/s
134                     * acodec     Name of the audio codec in use
135                     * asr        Audio sampling rate in Hertz
136                     * vbr        Average video bitrate in KBit/s
137                     * fps        Frame rate
138                     * vcodec     Name of the video codec in use
139                     * container  Name of the container format
140                     * filesize   The number of bytes, if known in advance
141                     * filesize_approx  An estimate for the number of bytes
142                     * player_url SWF Player URL (used for rtmpdump).
143                     * protocol   The protocol that will be used for the actual
144                                  download, lower-case.
145                                  "http", "https", "rtsp", "rtmp", "rtmpe",
146                                  "m3u8", "m3u8_native" or "http_dash_segments".
147                     * fragment_base_url
148                                  Base URL for fragments. Each fragment's path
149                                  value (if present) will be relative to
150                                  this URL.
151                     * fragments  A list of fragments of a fragmented media.
152                                  Each fragment entry must contain either an url
153                                  or a path. If an url is present it should be
154                                  considered by a client. Otherwise both path and
155                                  fragment_base_url must be present. Here is
156                                  the list of all potential fields:
157                                  * "url" - fragment's URL
158                                  * "path" - fragment's path relative to
159                                             fragment_base_url
160                                  * "duration" (optional, int or float)
161                                  * "filesize" (optional, int)
162                     * preference Order number of this format. If this field is
163                                  present and not None, the formats get sorted
164                                  by this field, regardless of all other values.
165                                  -1 for default (order by other properties),
166                                  -2 or smaller for less than default.
167                                  < -1000 to hide the format (if there is
168                                     another one which is strictly better)
169                     * language   Language code, e.g. "de" or "en-US".
170                     * language_preference  Is this in the language mentioned in
171                                  the URL?
172                                  10 if it's what the URL is about,
173                                  -1 for default (don't know),
174                                  -10 otherwise, other values reserved for now.
175                     * quality    Order number of the video quality of this
176                                  format, irrespective of the file format.
177                                  -1 for default (order by other properties),
178                                  -2 or smaller for less than default.
179                     * source_preference  Order number for this video source
180                                   (quality takes higher priority)
181                                  -1 for default (order by other properties),
182                                  -2 or smaller for less than default.
183                     * http_headers  A dictionary of additional HTTP headers
184                                  to add to the request.
185                     * stretched_ratio  If given and not 1, indicates that the
186                                  video's pixels are not square.
187                                  width : height ratio as float.
188                     * no_resume  The server does not support resuming the
189                                  (HTTP or RTMP) download. Boolean.
190                     * downloader_options  A dictionary of downloader options as
191                                  described in FileDownloader
192
193     url:            Final video URL.
194     ext:            Video filename extension.
195     format:         The video format, defaults to ext (used for --get-format)
196     player_url:     SWF Player URL (used for rtmpdump).
197
198     The following fields are optional:
199
200     alt_title:      A secondary title of the video.
201     display_id      An alternative identifier for the video, not necessarily
202                     unique, but available before title. Typically, id is
203                     something like "4234987", title "Dancing naked mole rats",
204                     and display_id "dancing-naked-mole-rats"
205     thumbnails:     A list of dictionaries, with the following entries:
206                         * "id" (optional, string) - Thumbnail format ID
207                         * "url"
208                         * "preference" (optional, int) - quality of the image
209                         * "width" (optional, int)
210                         * "height" (optional, int)
211                         * "resolution" (optional, string "{width}x{height}",
212                                         deprecated)
213                         * "filesize" (optional, int)
214     thumbnail:      Full URL to a video thumbnail image.
215     description:    Full video description.
216     uploader:       Full name of the video uploader.
217     license:        License name the video is licensed under.
218     creator:        The creator of the video.
219     release_date:   The date (YYYYMMDD) when the video was released.
220     timestamp:      UNIX timestamp of the moment the video became available.
221     upload_date:    Video upload date (YYYYMMDD).
222                     If not explicitly set, calculated from timestamp.
223     uploader_id:    Nickname or id of the video uploader.
224     uploader_url:   Full URL to a personal webpage of the video uploader.
225     channel:        Full name of the channel the video is uploaded on.
226                     Note that channel fields may or may not repeat uploader
227                     fields. This depends on a particular extractor.
228     channel_id:     Id of the channel.
229     channel_url:    Full URL to a channel webpage.
230     location:       Physical location where the video was filmed.
231     subtitles:      The available subtitles as a dictionary in the format
232                     {tag: subformats}. "tag" is usually a language code, and
233                     "subformats" is a list sorted from lower to higher
234                     preference, each element is a dictionary with the "ext"
235                     entry and one of:
236                         * "data": The subtitles file contents
237                         * "url": A URL pointing to the subtitles file
238                     "ext" will be calculated from URL if missing
239     automatic_captions: Like 'subtitles', used by the YoutubeIE for
240                     automatically generated captions
241     duration:       Length of the video in seconds, as an integer or float.
242     view_count:     How many users have watched the video on the platform.
243     like_count:     Number of positive ratings of the video
244     dislike_count:  Number of negative ratings of the video
245     repost_count:   Number of reposts of the video
246     average_rating: Average rating given by users, the scale used depends on the webpage
247     comment_count:  Number of comments on the video
248     comments:       A list of comments, each with one or more of the following
249                     properties (all but one of text or html optional):
250                         * "author" - human-readable name of the comment author
251                         * "author_id" - user ID of the comment author
252                         * "id" - Comment ID
253                         * "html" - Comment as HTML
254                         * "text" - Plain text of the comment
255                         * "timestamp" - UNIX timestamp of comment
256                         * "parent" - ID of the comment this one is replying to.
257                                      Set to "root" to indicate that this is a
258                                      comment to the original video.
259     age_limit:      Age restriction for the video, as an integer (years)
260     webpage_url:    The URL to the video webpage, if given to youtube-dl it
261                     should allow to get the same result again. (It will be set
262                     by YoutubeDL if it's missing)
263     categories:     A list of categories that the video falls in, for example
264                     ["Sports", "Berlin"]
265     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
266     is_live:        True, False, or None (=unknown). Whether this video is a
267                     live stream that goes on instead of a fixed-length video.
268     start_time:     Time in seconds where the reproduction should start, as
269                     specified in the URL.
270     end_time:       Time in seconds where the reproduction should end, as
271                     specified in the URL.
272     chapters:       A list of dictionaries, with the following entries:
273                         * "start_time" - The start time of the chapter in seconds
274                         * "end_time" - The end time of the chapter in seconds
275                         * "title" (optional, string)
276
277     The following fields should only be used when the video belongs to some logical
278     chapter or section:
279
280     chapter:        Name or title of the chapter the video belongs to.
281     chapter_number: Number of the chapter the video belongs to, as an integer.
282     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
283
284     The following fields should only be used when the video is an episode of some
285     series, programme or podcast:
286
287     series:         Title of the series or programme the video episode belongs to.
288     season:         Title of the season the video episode belongs to.
289     season_number:  Number of the season the video episode belongs to, as an integer.
290     season_id:      Id of the season the video episode belongs to, as a unicode string.
291     episode:        Title of the video episode. Unlike mandatory video title field,
292                     this field should denote the exact title of the video episode
293                     without any kind of decoration.
294     episode_number: Number of the video episode within a season, as an integer.
295     episode_id:     Id of the video episode, as a unicode string.
296
297     The following fields should only be used when the media is a track or a part of
298     a music album:
299
300     track:          Title of the track.
301     track_number:   Number of the track within an album or a disc, as an integer.
302     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
303                     as a unicode string.
304     artist:         Artist(s) of the track.
305     genre:          Genre(s) of the track.
306     album:          Title of the album the track belongs to.
307     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
308     album_artist:   List of all artists appeared on the album (e.g.
309                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
310                     and compilations).
311     disc_number:    Number of the disc or other physical medium the track belongs to,
312                     as an integer.
313     release_year:   Year (YYYY) when the album was released.
314
315     Unless mentioned otherwise, the fields should be Unicode strings.
316
317     Unless mentioned otherwise, None is equivalent to absence of information.
318
319
320     _type "playlist" indicates multiple videos.
321     There must be a key "entries", which is a list, an iterable, or a PagedList
322     object, each element of which is a valid dictionary by this specification.
323
324     Additionally, playlists can have "id", "title", "description", "uploader",
325     "uploader_id", "uploader_url" attributes with the same semantics as videos
326     (see above).
327
328
329     _type "multi_video" indicates that there are multiple videos that
330     form a single show, for example, multiple acts of an opera or TV episode.
331     It must have an entries key like a playlist and contain all the keys
332     required for a video at the same time.
333
334
335     _type "url" indicates that the video must be extracted from another
336     location, possibly by a different extractor. Its only required key is:
337     "url" - the next URL to extract.
338     The key "ie_key" can be set to the class name (minus the trailing "IE",
339     e.g. "Youtube") if the extractor class is known in advance.
340     Additionally, the dictionary may have any properties of the resolved entity
341     known in advance, for example "title" if the title of the referred video is
342     known ahead of time.
343
344
345     _type "url_transparent" entities have the same specification as "url", but
346     indicate that the given additional information is more precise than the one
347     associated with the resolved URL.
348     This is useful when a site employs a video service that hosts the video and
349     its technical metadata, but that video service does not embed a useful
350     title, description etc.
351
352
353     Subclasses of this one should re-define the _real_initialize() and
354     _real_extract() methods and define a _VALID_URL regexp.
355     Probably, they should also be added to the list of extractors.
356
357     _GEO_BYPASS attribute may be set to False in order to disable
358     geo restriction bypass mechanisms for a particular extractor.
359     Though it won't disable explicit geo restriction bypass based on
360     country code provided with geo_bypass_country.
361
362     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
363     countries for this extractor. One of these countries will be used by
364     geo restriction bypass mechanism right away in order to bypass
365     geo restriction, of course, if the mechanism is not disabled.
366
367     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
368     IP blocks in CIDR notation for this extractor. One of these IP blocks
369     will be used by geo restriction bypass mechanism similarly
370     to _GEO_COUNTRIES.
371
372     Finally, the _WORKING attribute should be set to False for broken IEs
373     in order to warn the users and skip the tests.
374     """
375
376     _ready = False
377     _downloader = None
378     _x_forwarded_for_ip = None
379     _GEO_BYPASS = True
380     _GEO_COUNTRIES = None
381     _GEO_IP_BLOCKS = None
382     _WORKING = True
383
384     def __init__(self, downloader=None):
385         """Constructor. Receives an optional downloader."""
386         self._ready = False
387         self._x_forwarded_for_ip = None
388         self.set_downloader(downloader)
389
390     @classmethod
391     def suitable(cls, url):
392         """Receives a URL and returns True if suitable for this IE."""
393
394         # This does not use has/getattr intentionally - we want to know whether
395         # we have cached the regexp for *this* class, whereas getattr would also
396         # match the superclass
397         if '_VALID_URL_RE' not in cls.__dict__:
398             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
399         return cls._VALID_URL_RE.match(url) is not None
400
401     @classmethod
402     def _match_id(cls, url):
403         if '_VALID_URL_RE' not in cls.__dict__:
404             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
405         m = cls._VALID_URL_RE.match(url)
406         assert m
407         return compat_str(m.group('id'))
408
409     @classmethod
410     def working(cls):
411         """Getter method for _WORKING."""
412         return cls._WORKING
413
414     def initialize(self):
415         """Initializes an instance (authentication, etc)."""
416         self._initialize_geo_bypass({
417             'countries': self._GEO_COUNTRIES,
418             'ip_blocks': self._GEO_IP_BLOCKS,
419         })
420         if not self._ready:
421             self._real_initialize()
422             self._ready = True
423
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        # An IP is only ever faked once per instance; a value already set
        # (e.g. by a previous call or by __maybe_fake_ip_and_retry) is kept.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                # NOTE(review): random_ipv4 is fed a bare country code here
                # (not a CIDR block) - presumably it maps the code to a known
                # IP range; confirm in GeoUtils.
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))
511
512     def extract(self, url):
513         """Extracts URL information and returns it in list of dicts."""
514         try:
515             for _ in range(2):
516                 try:
517                     self.initialize()
518                     ie_result = self._real_extract(url)
519                     if self._x_forwarded_for_ip:
520                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
521                     return ie_result
522                 except GeoRestrictedError as e:
523                     if self.__maybe_fake_ip_and_retry(e.countries):
524                         continue
525                     raise
526         except ExtractorError:
527             raise
528         except compat_http_client.IncompleteRead as e:
529             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
530         except (KeyError, StopIteration) as e:
531             raise ExtractorError('An extractor error has occurred.', cause=e)
532
533     def __maybe_fake_ip_and_retry(self, countries):
534         if (not self._downloader.params.get('geo_bypass_country', None) and
535                 self._GEO_BYPASS and
536                 self._downloader.params.get('geo_bypass', True) and
537                 not self._x_forwarded_for_ip and
538                 countries):
539             country_code = random.choice(countries)
540             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
541             if self._x_forwarded_for_ip:
542                 self.report_warning(
543                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
544                     % (self._x_forwarded_for_ip, country_code.upper()))
545                 return True
546         return False
547
548     def set_downloader(self, downloader):
549         """Sets the downloader for this IE."""
550         self._downloader = downloader
551
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Deliberately a no-op here: subclasses override this for per-site
        # setup (e.g. authentication); the base class has nothing to do.
        pass
555
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Deliberately a no-op here: subclasses override this to perform the
        # actual extraction and return an info dict (see the class docstring).
        pass
559
560     @classmethod
561     def ie_key(cls):
562         """A string for getting the InfoExtractor with get_info_extractor"""
563         return compat_str(cls.__name__[:-2])
564
565     @property
566     def IE_NAME(self):
567         return compat_str(type(self).__name__[:-2])
568
569     @staticmethod
570     def __can_accept_status_code(err, expected_status):
571         assert isinstance(err, compat_urllib_error.HTTPError)
572         if expected_status is None:
573             return False
574         if isinstance(expected_status, compat_integer_types):
575             return err.code == expected_status
576         elif isinstance(expected_status, (list, tuple)):
577             return err.code in expected_status
578         elif callable(expected_status):
579             return expected_status(err.code) is True
580         else:
581             assert False
582
583     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
584         """
585         Return the response handle.
586
587         See _download_webpage docstring for arguments specification.
588         """
589         if note is None:
590             self.report_download_webpage(video_id)
591         elif note is not False:
592             if video_id is None:
593                 self.to_screen('%s' % (note,))
594             else:
595                 self.to_screen('%s: %s' % (video_id, note))
596
597         # Some sites check X-Forwarded-For HTTP header in order to figure out
598         # the origin of the client behind proxy. This allows bypassing geo
599         # restriction by faking this header's value to IP that belongs to some
600         # geo unrestricted country. We will do so once we encounter any
601         # geo restriction error.
602         if self._x_forwarded_for_ip:
603             if 'X-Forwarded-For' not in headers:
604                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
605
606         if isinstance(url_or_request, compat_urllib_request.Request):
607             url_or_request = update_Request(
608                 url_or_request, data=data, headers=headers, query=query)
609         else:
610             if query:
611                 url_or_request = update_url_query(url_or_request, query)
612             if data is not None or headers:
613                 url_or_request = sanitized_Request(url_or_request, data, headers)
614         try:
615             return self._downloader.urlopen(url_or_request)
616         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
617             if isinstance(err, compat_urllib_error.HTTPError):
618                 if self.__can_accept_status_code(err, expected_status):
619                     # Retain reference to error to prevent file object from
620                     # being closed before it can be read. Works around the
621                     # effects of <https://bugs.python.org/issue15002>
622                     # introduced in Python 3.4.1.
623                     err.fp._error = err
624                     return err.fp
625
626             if errnote is False:
627                 return False
628             if errnote is None:
629                 errnote = 'Unable to download webpage'
630
631             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
632             if fatal:
633                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
634             else:
635                 self._downloader.report_warning(errmsg)
636                 return False
637
638     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
639         """
640         Return a tuple (page content as string, URL handle).
641
642         See _download_webpage docstring for arguments specification.
643         """
644         # Strip hashes from the URL (#1038)
645         if isinstance(url_or_request, (compat_str, str)):
646             url_or_request = url_or_request.partition('#')[0]
647
648         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
649         if urlh is False:
650             assert not fatal
651             return False
652         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
653         return (content, urlh)
654
655     @staticmethod
656     def _guess_encoding_from_content(content_type, webpage_bytes):
657         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
658         if m:
659             encoding = m.group(1)
660         else:
661             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
662                           webpage_bytes[:1024])
663             if m:
664                 encoding = m.group(1).decode('ascii')
665             elif webpage_bytes.startswith(b'\xff\xfe'):
666                 encoding = 'utf-16'
667             else:
668                 encoding = 'utf-8'
669
670         return encoding
671
    def __check_blocked(self, content):
        """
        Raise ExtractorError if *content* looks like a censorship/filtering
        block page (Websense, Indian censorship or Russian RKN) instead of
        the requested webpage.
        """
        first_block = content[:512]
        # Websense filtering appliances insert their own <title> and mention
        # themselves near the top of the page
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian TTK/RKN block page (the title reads "Access to the
        # resource is restricted")
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
699
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read and decode the response body of *urlh*, honoring the
        dump_intermediate_pages and write_pages options and raising for
        known censorship block pages.

        prefix -- raw bytes prepended to the body before decoding
        encoding -- override for the guessed character encoding
        note, errnote and fatal are accepted but not used in this method.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps arbitrary binary content console-safe
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the dump file name within filesystem limits by
                # replacing the tail with a hash of the full name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name - fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
736
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        # Only truncated reads are retried here; other errors propagate
        # from _download_webpage_handle (or yield False when fatal=False)
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            # Non-fatal failure, already reported as a warning
            return res
        else:
            content, _ = res
            return content
794
795     def _download_xml_handle(
796             self, url_or_request, video_id, note='Downloading XML',
797             errnote='Unable to download XML', transform_source=None,
798             fatal=True, encoding=None, data=None, headers={}, query={},
799             expected_status=None):
800         """
801         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
802
803         See _download_webpage docstring for arguments specification.
804         """
805         res = self._download_webpage_handle(
806             url_or_request, video_id, note, errnote, fatal=fatal,
807             encoding=encoding, data=data, headers=headers, query=query,
808             expected_status=expected_status)
809         if res is False:
810             return res
811         xml_string, urlh = res
812         return self._parse_xml(
813             xml_string, video_id, transform_source=transform_source,
814             fatal=fatal), urlh
815
816     def _download_xml(
817             self, url_or_request, video_id,
818             note='Downloading XML', errnote='Unable to download XML',
819             transform_source=None, fatal=True, encoding=None,
820             data=None, headers={}, query={}, expected_status=None):
821         """
822         Return the xml as an xml.etree.ElementTree.Element.
823
824         See _download_webpage docstring for arguments specification.
825         """
826         res = self._download_xml_handle(
827             url_or_request, video_id, note=note, errnote=errnote,
828             transform_source=transform_source, fatal=fatal, encoding=encoding,
829             data=data, headers=headers, query=query,
830             expected_status=expected_status)
831         return res if res is False else res[0]
832
833     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
834         if transform_source:
835             xml_string = transform_source(xml_string)
836         try:
837             return compat_etree_fromstring(xml_string.encode('utf-8'))
838         except compat_xml_parse_error as ve:
839             errmsg = '%s: Failed to parse XML ' % video_id
840             if fatal:
841                 raise ExtractorError(errmsg, cause=ve)
842             else:
843                 self.report_warning(errmsg + str(ve))
844
845     def _download_json_handle(
846             self, url_or_request, video_id, note='Downloading JSON metadata',
847             errnote='Unable to download JSON metadata', transform_source=None,
848             fatal=True, encoding=None, data=None, headers={}, query={},
849             expected_status=None):
850         """
851         Return a tuple (JSON object, URL handle).
852
853         See _download_webpage docstring for arguments specification.
854         """
855         res = self._download_webpage_handle(
856             url_or_request, video_id, note, errnote, fatal=fatal,
857             encoding=encoding, data=data, headers=headers, query=query,
858             expected_status=expected_status)
859         if res is False:
860             return res
861         json_string, urlh = res
862         return self._parse_json(
863             json_string, video_id, transform_source=transform_source,
864             fatal=fatal), urlh
865
866     def _download_json(
867             self, url_or_request, video_id, note='Downloading JSON metadata',
868             errnote='Unable to download JSON metadata', transform_source=None,
869             fatal=True, encoding=None, data=None, headers={}, query={},
870             expected_status=None):
871         """
872         Return the JSON object as a dict.
873
874         See _download_webpage docstring for arguments specification.
875         """
876         res = self._download_json_handle(
877             url_or_request, video_id, note=note, errnote=errnote,
878             transform_source=transform_source, fatal=fatal, encoding=encoding,
879             data=data, headers=headers, query=query,
880             expected_status=expected_status)
881         return res if res is False else res[0]
882
883     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
884         if transform_source:
885             json_string = transform_source(json_string)
886         try:
887             return json.loads(json_string)
888         except ValueError as ve:
889             errmsg = '%s: Failed to parse JSON ' % video_id
890             if fatal:
891                 raise ExtractorError(errmsg, cause=ve)
892             else:
893                 self.report_warning(errmsg + str(ve))
894
895     def report_warning(self, msg, video_id=None):
896         idstr = '' if video_id is None else '%s: ' % video_id
897         self._downloader.report_warning(
898             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
899
900     def to_screen(self, msg):
901         """Print msg to screen, prefixing it with '[ie_name]'"""
902         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
903
904     def report_extraction(self, id_or_name):
905         """Report information extraction."""
906         self.to_screen('%s: Extracting information' % id_or_name)
907
908     def report_download_webpage(self, video_id):
909         """Report webpage download."""
910         self.to_screen('%s: Downloading webpage' % video_id)
911
912     def report_age_confirmation(self):
913         """Report attempt to confirm age."""
914         self.to_screen('Confirming age')
915
916     def report_login(self):
917         """Report attempt to log in."""
918         self.to_screen('Logging in')
919
920     @staticmethod
921     def raise_login_required(msg='This video is only available for registered users'):
922         raise ExtractorError(
923             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
924             expected=True)
925
926     @staticmethod
927     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
928         raise GeoRestrictedError(msg, countries=countries)
929
930     # Methods for following #608
931     @staticmethod
932     def url_result(url, ie=None, video_id=None, video_title=None):
933         """Returns a URL that points to a page that should be processed"""
934         # TODO: ie should be the class used for getting the info
935         video_info = {'_type': 'url',
936                       'url': url,
937                       'ie_key': ie}
938         if video_id is not None:
939             video_info['id'] = video_id
940         if video_title is not None:
941             video_info['title'] = video_title
942         return video_info
943
944     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
945         urls = orderedSet(
946             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
947             for m in matches)
948         return self.playlist_result(
949             urls, playlist_id=playlist_id, playlist_title=playlist_title)
950
951     @staticmethod
952     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
953         """Returns a playlist"""
954         video_info = {'_type': 'playlist',
955                       'entries': entries}
956         if playlist_id:
957             video_info['id'] = playlist_id
958         if playlist_title:
959             video_info['title'] = playlist_title
960         if playlist_description:
961             video_info['description'] = playlist_description
962         return video_info
963
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Try each pattern in order, stopping at the first match.
            # NOTE(review): an empty pattern sequence would leave mobj
            # unbound and raise UnboundLocalError below - callers are
            # expected to pass at least one pattern.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue when the terminal supports it
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
997
998     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
999         """
1000         Like _search_regex, but strips HTML tags and unescapes entities.
1001         """
1002         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1003         if res:
1004             return clean_html(res).strip()
1005         else:
1006             return res
1007
1008     def _get_netrc_login_info(self, netrc_machine=None):
1009         username = None
1010         password = None
1011         netrc_machine = netrc_machine or self._NETRC_MACHINE
1012
1013         if self._downloader.params.get('usenetrc', False):
1014             try:
1015                 info = netrc.netrc().authenticators(netrc_machine)
1016                 if info is not None:
1017                     username = info[0]
1018                     password = info[2]
1019                 else:
1020                     raise netrc.NetrcParseError(
1021                         'No authenticators for %s' % netrc_machine)
1022             except (IOError, netrc.NetrcParseError) as err:
1023                 self._downloader.report_warning(
1024                     'parsing .netrc: %s' % error_to_compat_str(err))
1025
1026         return username, password
1027
1028     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1029         """
1030         Get the login info as (username, password)
1031         First look for the manually specified credentials using username_option
1032         and password_option as keys in params dictionary. If no such credentials
1033         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1034         value.
1035         If there's no info available, return (None, None)
1036         """
1037         if self._downloader is None:
1038             return (None, None)
1039
1040         downloader_params = self._downloader.params
1041
1042         # Attempt to use provided username and password or .netrc data
1043         if downloader_params.get(username_option) is not None:
1044             username = downloader_params[username_option]
1045             password = downloader_params[password_option]
1046         else:
1047             username, password = self._get_netrc_login_info(netrc_machine)
1048
1049         return username, password
1050
1051     def _get_tfa_info(self, note='two-factor verification code'):
1052         """
1053         Get the two-factor authentication info
1054         TODO - asking the user will be required for sms/phone verify
1055         currently just uses the command line option
1056         If there's no info available, return None
1057         """
1058         if self._downloader is None:
1059             return None
1060         downloader_params = self._downloader.params
1061
1062         if downloader_params.get('twofactor') is not None:
1063             return downloader_params['twofactor']
1064
1065         return compat_getpass('Type %s and press [Return]: ' % note)
1066
1067     # Helper functions for extracting OpenGraph info
1068     @staticmethod
1069     def _og_regexes(prop):
1070         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1071         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1072                        % {'prop': re.escape(prop)})
1073         template = r'<meta[^>]+?%s[^>]+?%s'
1074         return [
1075             template % (property_re, content_re),
1076             template % (content_re, property_re),
1077         ]
1078
1079     @staticmethod
1080     def _meta_regex(prop):
1081         return r'''(?isx)<meta
1082                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1083                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1084
1085     def _og_search_property(self, prop, html, name=None, **kargs):
1086         if not isinstance(prop, (list, tuple)):
1087             prop = [prop]
1088         if name is None:
1089             name = 'OpenGraph %s' % prop[0]
1090         og_regexes = []
1091         for p in prop:
1092             og_regexes.extend(self._og_regexes(p))
1093         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1094         if escaped is None:
1095             return None
1096         return unescapeHTML(escaped)
1097
1098     def _og_search_thumbnail(self, html, **kargs):
1099         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1100
1101     def _og_search_description(self, html, **kargs):
1102         return self._og_search_property('description', html, fatal=False, **kargs)
1103
1104     def _og_search_title(self, html, **kargs):
1105         return self._og_search_property('title', html, **kargs)
1106
1107     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1108         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1109         if secure:
1110             regexes = self._og_regexes('video:secure_url') + regexes
1111         return self._html_search_regex(regexes, html, name, **kargs)
1112
1113     def _og_search_url(self, html, **kargs):
1114         return self._og_search_property('url', html, **kargs)
1115
1116     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1117         if not isinstance(name, (list, tuple)):
1118             name = [name]
1119         if display_name is None:
1120             display_name = name[0]
1121         return self._html_search_regex(
1122             [self._meta_regex(n) for n in name],
1123             html, display_name, fatal=fatal, group='content', **kwargs)
1124
1125     def _dc_search_uploader(self, html):
1126         return self._html_search_meta('dc.creator', html, 'uploader')
1127
1128     def _rta_search(self, html):
1129         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1130         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1131                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1132                      html):
1133             return 18
1134         return 0
1135
1136     def _media_rating_search(self, html):
1137         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1138         rating = self._html_search_meta('rating', html)
1139
1140         if not rating:
1141             return None
1142
1143         RATING_TABLE = {
1144             'safe for kids': 0,
1145             'general': 8,
1146             '14 years': 14,
1147             'mature': 17,
1148             'restricted': 19,
1149         }
1150         return RATING_TABLE.get(rating.lower())
1151
1152     def _family_friendly_search(self, html):
1153         # See http://schema.org/VideoObject
1154         family_friendly = self._html_search_meta(
1155             'isFamilyFriendly', html, default=None)
1156
1157         if not family_friendly:
1158             return None
1159
1160         RATING_TABLE = {
1161             '1': 0,
1162             'true': 0,
1163             '0': 18,
1164             'false': 18,
1165         }
1166         return RATING_TABLE.get(family_friendly.lower())
1167
1168     def _twitter_search_player(self, html):
1169         return self._html_search_meta('twitter:player', html,
1170                                       'twitter card player')
1171
1172     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1173         json_ld = self._search_regex(
1174             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1175         default = kwargs.get('default', NO_DEFAULT)
1176         if not json_ld:
1177             return default if default is not NO_DEFAULT else {}
1178         # JSON-LD may be malformed and thus `fatal` should be respected.
1179         # At the same time `default` may be passed that assumes `fatal=False`
1180         # for _search_regex. Let's simulate the same behavior here as well.
1181         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1182         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1183
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Extract video metadata from JSON-LD data.

        json_ld may be a raw string (parsed here), a dict or a list of dicts;
        returns an info dict containing only the non-None fields. When
        expected_type is given, items of a different @type are skipped.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps the last path component of a schema.org interactionType URL
        # to the corresponding *_count field of the info dict
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            # Collect *_count fields from schema.org InteractionCounter
            # entries; the first valid occurrence of each kind wins
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # interactionType is a URL such as http://schema.org/WatchAction
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Fill info from a schema.org VideoObject item
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # NOTE(review): the dots in the regex below are unescaped, so
            # e.g. "http://schemaXorg" would also match - harmless in
            # practice but worth confirming if this check is ever tightened
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # unlike the other item types, keep scanning subsequent
                    # items instead of stopping at the first match
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                break
        # Drop the fields that ended up as None
        return dict((k, v) for k, v in info.items() if v is not None)
1290
1291     @staticmethod
1292     def _hidden_inputs(html):
1293         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1294         hidden_inputs = {}
1295         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1296             attrs = extract_attributes(input)
1297             if not input:
1298                 continue
1299             if attrs.get('type') not in ('hidden', 'submit'):
1300                 continue
1301             name = attrs.get('name') or attrs.get('id')
1302             value = attrs.get('value')
1303             if name and value is not None:
1304                 hidden_inputs[name] = value
1305         return hidden_inputs
1306
1307     def _form_hidden_inputs(self, form_id, html):
1308         form = self._search_regex(
1309             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1310             html, '%s form' % form_id, group='form')
1311         return self._hidden_inputs(form)
1312
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        If field_preference (a list/tuple of format-dict keys) is given, the
        formats are ordered by those fields alone; otherwise a built-in
        multi-criteria key (preference, language, quality, bitrate, size,
        resolution, protocol, extension, ...) is used.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            # Derive a missing extension from the URL so ext-based ranking works.
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering: a missing value sorts lowest
                # ('' for format_id, -1 for everything else).
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            # HTTP(S) ranks highest, RTSP lowest, everything else in between.
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare lexicographically, so earlier entries dominate.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1388
1389     def _check_formats(self, formats, video_id):
1390         if formats:
1391             formats[:] = filter(
1392                 lambda f: self._is_valid_url(
1393                     f['url'], video_id,
1394                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1395                 formats)
1396
1397     @staticmethod
1398     def _remove_duplicate_formats(formats):
1399         format_urls = set()
1400         unique_formats = []
1401         for f in formats:
1402             if f['url'] not in format_urls:
1403                 format_urls.add(f['url'])
1404                 unique_formats.append(f)
1405         formats[:] = unique_formats
1406
1407     def _is_valid_url(self, url, video_id, item='video', headers={}):
1408         url = self._proto_relative_url(url, scheme='http:')
1409         # For now assume non HTTP(S) URLs always valid
1410         if not (url.startswith('http://') or url.startswith('https://')):
1411             return True
1412         try:
1413             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1414             return True
1415         except ExtractorError as e:
1416             if isinstance(e.cause, compat_urllib_error.URLError):
1417                 self.to_screen(
1418                     '%s: %s URL is invalid, skipping' % (video_id, item))
1419                 return False
1420             raise
1421
1422     def http_scheme(self):
1423         """ Either "http:" or "https:", depending on the user's preferences """
1424         return (
1425             'http:'
1426             if self._downloader.params.get('prefer_insecure', False)
1427             else 'https:')
1428
1429     def _proto_relative_url(self, url, scheme=None):
1430         if url is None:
1431             return url
1432         if url.startswith('//'):
1433             if scheme is None:
1434                 scheme = self.http_scheme()
1435             return scheme + url
1436         else:
1437             return url
1438
1439     def _sleep(self, timeout, video_id, msg_template=None):
1440         if msg_template is None:
1441             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1442         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1443         self.to_screen(msg)
1444         time.sleep(timeout)
1445
1446     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1447                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1448                              fatal=True, m3u8_id=None):
1449         manifest = self._download_xml(
1450             manifest_url, video_id, 'Downloading f4m manifest',
1451             'Unable to download f4m manifest',
1452             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1453             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1454             transform_source=transform_source,
1455             fatal=fatal)
1456
1457         if manifest is False:
1458             return []
1459
1460         return self._parse_f4m_formats(
1461             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1462             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1463
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m manifest (parsed XML element) into a list of format dicts.

        manifest_url is the URL the manifest was fetched from and is used to
        resolve relative media URLs. Nested f4m/m3u8 manifests referenced from
        set-level manifests are extracted recursively.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        # <media> moved namespaces between f4m 1.0 and 2.0; try 1.0 first.
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        # An audio/* mime type means there is no video track at all.
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Fall back to the media index when no bitrate is advertised.
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                # Resolve relative media URLs against the manifest base URL
                # (or the manifest URL's directory when no base is declared).
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1561
1562     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1563         return {
1564             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1565             'url': m3u8_url,
1566             'ext': ext,
1567             'protocol': 'm3u8',
1568             'preference': preference - 100 if preference else -100,
1569             'resolution': 'multiple',
1570             'format_note': 'Quality selection URL',
1571         }
1572
1573     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1574                               entry_protocol='m3u8', preference=None,
1575                               m3u8_id=None, note=None, errnote=None,
1576                               fatal=True, live=False):
1577         res = self._download_webpage_handle(
1578             m3u8_url, video_id,
1579             note=note or 'Downloading m3u8 information',
1580             errnote=errnote or 'Failed to download m3u8 information',
1581             fatal=fatal)
1582
1583         if res is False:
1584             return []
1585
1586         m3u8_doc, urlh = res
1587         m3u8_url = urlh.geturl()
1588
1589         return self._parse_m3u8_formats(
1590             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1591             preference=preference, m3u8_id=m3u8_id, live=live)
1592
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an m3u8 playlist document into a list of format dicts.

        m3u8_url is the URL the document was fetched from and is used to
        resolve relative URIs. DRM-protected playlists (Adobe Flash Access,
        Apple FairPlay) yield an empty list. For live streams (live=True) the
        bandwidth-derived part of format_id is omitted since it may vary over
        time.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve a possibly-relative URI against the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211
        # 3. https://github.com/rg3/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts in that group
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag seen
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Parse one EXT-X-MEDIA line into a rendition entry and, for
            # VIDEO/AUDIO renditions with their own URI, a format.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-comment line is the URI of the variant described by
                # the preceding EXT-X-STREAM-INF tag.
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1757
1758     @staticmethod
1759     def _xpath_ns(path, namespace=None):
1760         if not namespace:
1761             return path
1762         out = []
1763         for c in path.split('/'):
1764             if not c or c == '.':
1765                 out.append(c)
1766             else:
1767                 out.append('{%s}%s' % (namespace, c))
1768         return '/'.join(out)
1769
1770     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1771         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1772
1773         if smil is False:
1774             assert not fatal
1775             return []
1776
1777         namespace = self._parse_smil_namespace(smil)
1778
1779         return self._parse_smil_formats(
1780             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1781
1782     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1783         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1784         if smil is False:
1785             return {}
1786         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1787
1788     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1789         return self._download_xml(
1790             smil_url, video_id, 'Downloading SMIL file',
1791             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1792
1793     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1794         namespace = self._parse_smil_namespace(smil)
1795
1796         formats = self._parse_smil_formats(
1797             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1798         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1799
1800         video_id = os.path.splitext(url_basename(smil_url))[0]
1801         title = None
1802         description = None
1803         upload_date = None
1804         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1805             name = meta.attrib.get('name')
1806             content = meta.attrib.get('content')
1807             if not name or not content:
1808                 continue
1809             if not title and name == 'title':
1810                 title = content
1811             elif not description and name in ('description', 'abstract'):
1812                 description = content
1813             elif not upload_date and name == 'date':
1814                 upload_date = unified_strdate(content)
1815
1816         thumbnails = [{
1817             'id': image.get('type'),
1818             'url': image.get('src'),
1819             'width': int_or_none(image.get('width')),
1820             'height': int_or_none(image.get('height')),
1821         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1822
1823         return {
1824             'id': video_id,
1825             'title': title or video_id,
1826             'description': description,
1827             'upload_date': upload_date,
1828             'thumbnails': thumbnails,
1829             'formats': formats,
1830             'subtitles': subtitles,
1831         }
1832
1833     def _parse_smil_namespace(self, smil):
1834         return self._search_regex(
1835             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1836
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from the <video>/<audio> elements of a parsed
        SMIL document.

        Media sources are dispatched by protocol/extension: RTMP entries,
        m3u8/f4m/mpd/ISM sub-manifests (extracted via the respective helpers)
        and plain HTTP(S) progressive downloads. transform_rtmp_url, when
        given, is called as (streamer, play_path) -> (streamer, play_path) to
        rewrite RTMP URLs.
        """
        # The base URL for relative sources: an explicit <meta base=...> (or
        # httpBase) wins over the SMIL file's own URL.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # Deduplicate sources by their src attribute.
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry m3u8 carries no quality metadata of its own;
                # copy it over from this SMIL medium.
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
1931
1932     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1933         urls = []
1934         subtitles = {}
1935         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1936             src = textstream.get('src')
1937             if not src or src in urls:
1938                 continue
1939             urls.append(src)
1940             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1941             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1942             subtitles.setdefault(lang, []).append({
1943                 'url': src,
1944                 'ext': ext,
1945             })
1946         return subtitles
1947
1948     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1949         xspf = self._download_xml(
1950             xspf_url, playlist_id, 'Downloading xpsf playlist',
1951             'Unable to download xspf manifest', fatal=fatal)
1952         if xspf is False:
1953             return []
1954         return self._parse_xspf(
1955             xspf, playlist_id, xspf_url=xspf_url,
1956             xspf_base_url=base_url(xspf_url))
1957
1958     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1959         NS_MAP = {
1960             'xspf': 'http://xspf.org/ns/0/',
1961             's1': 'http://static.streamone.nl/player/ns/0',
1962         }
1963
1964         entries = []
1965         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1966             title = xpath_text(
1967                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1968             description = xpath_text(
1969                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1970             thumbnail = xpath_text(
1971                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1972             duration = float_or_none(
1973                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1974
1975             formats = []
1976             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1977                 format_url = urljoin(xspf_base_url, location.text)
1978                 if not format_url:
1979                     continue
1980                 formats.append({
1981                     'url': format_url,
1982                     'manifest_url': xspf_url,
1983                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1984                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1985                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1986                 })
1987             self._sort_formats(formats)
1988
1989             entries.append({
1990                 'id': playlist_id,
1991                 'title': title,
1992                 'description': description,
1993                 'thumbnail': thumbnail,
1994                 'duration': duration,
1995                 'formats': formats,
1996             })
1997         return entries
1998
1999     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
2000         res = self._download_xml_handle(
2001             mpd_url, video_id,
2002             note=note or 'Downloading MPD manifest',
2003             errnote=errnote or 'Failed to download MPD manifest',
2004             fatal=fatal)
2005         if res is False:
2006             return []
2007         mpd_doc, urlh = res
2008         mpd_base_url = base_url(urlh.geturl())
2009
2010         return self._parse_mpd_formats(
2011             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
2012             formats_dict=formats_dict, mpd_url=mpd_url)
2013
2014     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
2015         """
2016         Parse formats from MPD manifest.
2017         References:
2018          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2019             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2020          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2021         """
2022         if mpd_doc.get('type') == 'dynamic':
2023             return []
2024
2025         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2026
2027         def _add_ns(path):
2028             return self._xpath_ns(path, namespace)
2029
2030         def is_drm_protected(element):
2031             return element.find(_add_ns('ContentProtection')) is not None
2032
2033         def extract_multisegment_info(element, ms_parent_info):
2034             ms_info = ms_parent_info.copy()
2035
2036             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2037             # common attributes and elements.  We will only extract relevant
2038             # for us.
2039             def extract_common(source):
2040                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2041                 if segment_timeline is not None:
2042                     s_e = segment_timeline.findall(_add_ns('S'))
2043                     if s_e:
2044                         ms_info['total_number'] = 0
2045                         ms_info['s'] = []
2046                         for s in s_e:
2047                             r = int(s.get('r', 0))
2048                             ms_info['total_number'] += 1 + r
2049                             ms_info['s'].append({
2050                                 't': int(s.get('t', 0)),
2051                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2052                                 'd': int(s.attrib['d']),
2053                                 'r': r,
2054                             })
2055                 start_number = source.get('startNumber')
2056                 if start_number:
2057                     ms_info['start_number'] = int(start_number)
2058                 timescale = source.get('timescale')
2059                 if timescale:
2060                     ms_info['timescale'] = int(timescale)
2061                 segment_duration = source.get('duration')
2062                 if segment_duration:
2063                     ms_info['segment_duration'] = float(segment_duration)
2064
2065             def extract_Initialization(source):
2066                 initialization = source.find(_add_ns('Initialization'))
2067                 if initialization is not None:
2068                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2069
2070             segment_list = element.find(_add_ns('SegmentList'))
2071             if segment_list is not None:
2072                 extract_common(segment_list)
2073                 extract_Initialization(segment_list)
2074                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2075                 if segment_urls_e:
2076                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2077             else:
2078                 segment_template = element.find(_add_ns('SegmentTemplate'))
2079                 if segment_template is not None:
2080                     extract_common(segment_template)
2081                     media = segment_template.get('media')
2082                     if media:
2083                         ms_info['media'] = media
2084                     initialization = segment_template.get('initialization')
2085                     if initialization:
2086                         ms_info['initialization'] = initialization
2087                     else:
2088                         extract_Initialization(segment_template)
2089             return ms_info
2090
2091         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2092         formats = []
2093         for period in mpd_doc.findall(_add_ns('Period')):
2094             period_duration = parse_duration(period.get('duration')) or mpd_duration
2095             period_ms_info = extract_multisegment_info(period, {
2096                 'start_number': 1,
2097                 'timescale': 1,
2098             })
2099             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2100                 if is_drm_protected(adaptation_set):
2101                     continue
2102                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2103                 for representation in adaptation_set.findall(_add_ns('Representation')):
2104                     if is_drm_protected(representation):
2105                         continue
2106                     representation_attrib = adaptation_set.attrib.copy()
2107                     representation_attrib.update(representation.attrib)
2108                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2109                     mime_type = representation_attrib['mimeType']
2110                     content_type = mime_type.split('/')[0]
2111                     if content_type == 'text':
2112                         # TODO implement WebVTT downloading
2113                         pass
2114                     elif content_type in ('video', 'audio'):
2115                         base_url = ''
2116                         for element in (representation, adaptation_set, period, mpd_doc):
2117                             base_url_e = element.find(_add_ns('BaseURL'))
2118                             if base_url_e is not None:
2119                                 base_url = base_url_e.text + base_url
2120                                 if re.match(r'^https?://', base_url):
2121                                     break
2122                         if mpd_base_url and not re.match(r'^https?://', base_url):
2123                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2124                                 mpd_base_url += '/'
2125                             base_url = mpd_base_url + base_url
2126                         representation_id = representation_attrib.get('id')
2127                         lang = representation_attrib.get('lang')
2128                         url_el = representation.find(_add_ns('BaseURL'))
2129                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2130                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2131                         f = {
2132                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2133                             'url': mpd_url,
2134                             'manifest_url': mpd_url,
2135                             'ext': mimetype2ext(mime_type),
2136                             'width': int_or_none(representation_attrib.get('width')),
2137                             'height': int_or_none(representation_attrib.get('height')),
2138                             'tbr': float_or_none(bandwidth, 1000),
2139                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2140                             'fps': int_or_none(representation_attrib.get('frameRate')),
2141                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2142                             'format_note': 'DASH %s' % content_type,
2143                             'filesize': filesize,
2144                             'container': mimetype2ext(mime_type) + '_dash',
2145                         }
2146                         f.update(parse_codecs(representation_attrib.get('codecs')))
2147                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2148
2149                         def prepare_template(template_name, identifiers):
2150                             tmpl = representation_ms_info[template_name]
2151                             # First of, % characters outside $...$ templates
2152                             # must be escaped by doubling for proper processing
2153                             # by % operator string formatting used further (see
2154                             # https://github.com/rg3/youtube-dl/issues/16867).
2155                             t = ''
2156                             in_template = False
2157                             for c in tmpl:
2158                                 t += c
2159                                 if c == '$':
2160                                     in_template = not in_template
2161                                 elif c == '%' and not in_template:
2162                                     t += c
2163                             # Next, $...$ templates are translated to their
2164                             # %(...) counterparts to be used with % operator
2165                             t = t.replace('$RepresentationID$', representation_id)
2166                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2167                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2168                             t.replace('$$', '$')
2169                             return t
2170
2171                         # @initialization is a regular template like @media one
2172                         # so it should be handled just the same way (see
2173                         # https://github.com/rg3/youtube-dl/issues/11605)
2174                         if 'initialization' in representation_ms_info:
2175                             initialization_template = prepare_template(
2176                                 'initialization',
2177                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2178                                 # $Time$ shall not be included for @initialization thus
2179                                 # only $Bandwidth$ remains
2180                                 ('Bandwidth', ))
2181                             representation_ms_info['initialization_url'] = initialization_template % {
2182                                 'Bandwidth': bandwidth,
2183                             }
2184
2185                         def location_key(location):
2186                             return 'url' if re.match(r'^https?://', location) else 'path'
2187
2188                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2189
2190                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2191                             media_location_key = location_key(media_template)
2192
2193                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2194                             # can't be used at the same time
2195                             if '%(Number' in media_template and 's' not in representation_ms_info:
2196                                 segment_duration = None
2197                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2198                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2199                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2200                                 representation_ms_info['fragments'] = [{
2201                                     media_location_key: media_template % {
2202                                         'Number': segment_number,
2203                                         'Bandwidth': bandwidth,
2204                                     },
2205                                     'duration': segment_duration,
2206                                 } for segment_number in range(
2207                                     representation_ms_info['start_number'],
2208                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2209                             else:
2210                                 # $Number*$ or $Time$ in media template with S list available
2211                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2212                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2213                                 representation_ms_info['fragments'] = []
2214                                 segment_time = 0
2215                                 segment_d = None
2216                                 segment_number = representation_ms_info['start_number']
2217
2218                                 def add_segment_url():
2219                                     segment_url = media_template % {
2220                                         'Time': segment_time,
2221                                         'Bandwidth': bandwidth,
2222                                         'Number': segment_number,
2223                                     }
2224                                     representation_ms_info['fragments'].append({
2225                                         media_location_key: segment_url,
2226                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2227                                     })
2228
2229                                 for num, s in enumerate(representation_ms_info['s']):
2230                                     segment_time = s.get('t') or segment_time
2231                                     segment_d = s['d']
2232                                     add_segment_url()
2233                                     segment_number += 1
2234                                     for r in range(s.get('r', 0)):
2235                                         segment_time += segment_d
2236                                         add_segment_url()
2237                                         segment_number += 1
2238                                     segment_time += segment_d
2239                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2240                             # No media template
2241                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2242                             # or any YouTube dashsegments video
2243                             fragments = []
2244                             segment_index = 0
2245                             timescale = representation_ms_info['timescale']
2246                             for s in representation_ms_info['s']:
2247                                 duration = float_or_none(s['d'], timescale)
2248                                 for r in range(s.get('r', 0) + 1):
2249                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2250                                     fragments.append({
2251                                         location_key(segment_uri): segment_uri,
2252                                         'duration': duration,
2253                                     })
2254                                     segment_index += 1
2255                             representation_ms_info['fragments'] = fragments
2256                         elif 'segment_urls' in representation_ms_info:
2257                             # Segment URLs with no SegmentTimeline
2258                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2259                             # https://github.com/rg3/youtube-dl/pull/14844
2260                             fragments = []
2261                             segment_duration = float_or_none(
2262                                 representation_ms_info['segment_duration'],
2263                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2264                             for segment_url in representation_ms_info['segment_urls']:
2265                                 fragment = {
2266                                     location_key(segment_url): segment_url,
2267                                 }
2268                                 if segment_duration:
2269                                     fragment['duration'] = segment_duration
2270                                 fragments.append(fragment)
2271                             representation_ms_info['fragments'] = fragments
2272                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2273                         # No fragments key is present in this case.
2274                         if 'fragments' in representation_ms_info:
2275                             f.update({
2276                                 'fragment_base_url': base_url,
2277                                 'fragments': [],
2278                                 'protocol': 'http_dash_segments',
2279                             })
2280                             if 'initialization_url' in representation_ms_info:
2281                                 initialization_url = representation_ms_info['initialization_url']
2282                                 if not f.get('url'):
2283                                     f['url'] = initialization_url
2284                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2285                             f['fragments'].extend(representation_ms_info['fragments'])
2286                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2287                         # is not necessarily unique within a Period thus formats with
2288                         # the same `format_id` are quite possible. There are numerous examples
2289                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2290                         # https://github.com/rg3/youtube-dl/issues/13919)
2291                         full_info = formats_dict.get(representation_id, {}).copy()
2292                         full_info.update(f)
2293                         formats.append(full_info)
2294                     else:
2295                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2296         return formats
2297
2298     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2299         res = self._download_xml_handle(
2300             ism_url, video_id,
2301             note=note or 'Downloading ISM manifest',
2302             errnote=errnote or 'Failed to download ISM manifest',
2303             fatal=fatal)
2304         if res is False:
2305             return []
2306         ism_doc, urlh = res
2307
2308         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2309
2310     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2311         """
2312         Parse formats from ISM manifest.
2313         References:
2314          1. [MS-SSTR]: Smooth Streaming Protocol,
2315             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2316         """
2317         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2318             return []
2319
2320         duration = int(ism_doc.attrib['Duration'])
2321         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2322
2323         formats = []
2324         for stream in ism_doc.findall('StreamIndex'):
2325             stream_type = stream.get('Type')
2326             if stream_type not in ('video', 'audio'):
2327                 continue
2328             url_pattern = stream.attrib['Url']
2329             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2330             stream_name = stream.get('Name')
2331             for track in stream.findall('QualityLevel'):
2332                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2333                 # TODO: add support for WVC1 and WMAP
2334                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2335                     self.report_warning('%s is not a supported codec' % fourcc)
2336                     continue
2337                 tbr = int(track.attrib['Bitrate']) // 1000
2338                 # [1] does not mention Width and Height attributes. However,
2339                 # they're often present while MaxWidth and MaxHeight are
2340                 # missing, so should be used as fallbacks
2341                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2342                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2343                 sampling_rate = int_or_none(track.get('SamplingRate'))
2344
2345                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2346                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2347
2348                 fragments = []
2349                 fragment_ctx = {
2350                     'time': 0,
2351                 }
2352                 stream_fragments = stream.findall('c')
2353                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2354                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2355                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2356                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2357                     if not fragment_ctx['duration']:
2358                         try:
2359                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2360                         except IndexError:
2361                             next_fragment_time = duration
2362                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2363                     for _ in range(fragment_repeat):
2364                         fragments.append({
2365                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2366                             'duration': fragment_ctx['duration'] / stream_timescale,
2367                         })
2368                         fragment_ctx['time'] += fragment_ctx['duration']
2369
2370                 format_id = []
2371                 if ism_id:
2372                     format_id.append(ism_id)
2373                 if stream_name:
2374                     format_id.append(stream_name)
2375                 format_id.append(compat_str(tbr))
2376
2377                 formats.append({
2378                     'format_id': '-'.join(format_id),
2379                     'url': ism_url,
2380                     'manifest_url': ism_url,
2381                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2382                     'width': width,
2383                     'height': height,
2384                     'tbr': tbr,
2385                     'asr': sampling_rate,
2386                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2387                     'acodec': 'none' if stream_type == 'video' else fourcc,
2388                     'protocol': 'ism',
2389                     'fragments': fragments,
2390                     '_download_params': {
2391                         'duration': duration,
2392                         'timescale': stream_timescale,
2393                         'width': width or 0,
2394                         'height': height or 0,
2395                         'fourcc': fourcc,
2396                         'codec_private_data': track.get('CodecPrivateData'),
2397                         'sampling_rate': sampling_rate,
2398                         'channels': int_or_none(track.get('Channels', 2)),
2399                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2400                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2401                     },
2402                 })
2403         return formats
2404
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage.

        Both a tag's own src attribute and its nested <source> tags are
        examined; m3u8/mpd manifest URLs are expanded into their component
        formats, and <track> elements are collected as subtitles. Returns a
        list of partial info dicts (formats/subtitles/thumbnail), one per
        media tag that yielded at least one format or subtitle.
        """
        def absolute_url(item_url):
            # NOTE(review): item_url may be None (e.g. missing poster
            # attribute below); presumably urljoin() tolerates that and
            # returns None — verify against utils.urljoin
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive format hints (ext, codecs) from a MIME type string such
            # as 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"'
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats); is_plain_url is False when src
            # is a manifest (m3u8/mpd) that was expanded into several formats.
            # NOTE(review): the mutable {} default is only ever read here, so
            # it is harmless, but a None sentinel would be more idiomatic.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # First pass: self-closing tags, which have no inner content
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # res attribute is not standard but seen several times
                        # in the wild
                        f.update({
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Some servers refuse media requests without the page as Referer
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2500
2501     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2502         formats = []
2503         hdcore_sign = 'hdcore=3.7.0'
2504         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2505         hds_host = hosts.get('hds')
2506         if hds_host:
2507             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2508         if 'hdcore=' not in f4m_url:
2509             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2510         f4m_formats = self._extract_f4m_formats(
2511             f4m_url, video_id, f4m_id='hds', fatal=False)
2512         for entry in f4m_formats:
2513             entry.update({'extra_param_to_segment_url': hdcore_sign})
2514         formats.extend(f4m_formats)
2515         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2516         hls_host = hosts.get('hls')
2517         if hls_host:
2518             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2519         formats.extend(self._extract_m3u8_formats(
2520             m3u8_url, video_id, 'mp4', 'm3u8_native',
2521             m3u8_id='hls', fatal=False))
2522         return formats
2523
2524     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2525         query = compat_urlparse.urlparse(url).query
2526         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2527         mobj = re.search(
2528             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2529         url_base = mobj.group('url')
2530         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2531         formats = []
2532
2533         def manifest_url(manifest):
2534             m_url = '%s/%s' % (http_base_url, manifest)
2535             if query:
2536                 m_url += '?%s' % query
2537             return m_url
2538
2539         if 'm3u8' not in skip_protocols:
2540             formats.extend(self._extract_m3u8_formats(
2541                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2542                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2543         if 'f4m' not in skip_protocols:
2544             formats.extend(self._extract_f4m_formats(
2545                 manifest_url('manifest.f4m'),
2546                 video_id, f4m_id='hds', fatal=False))
2547         if 'dash' not in skip_protocols:
2548             formats.extend(self._extract_mpd_formats(
2549                 manifest_url('manifest.mpd'),
2550                 video_id, mpd_id='dash', fatal=False))
2551         if re.search(r'(?:/smil:|\.smil)', url_base):
2552             if 'smil' not in skip_protocols:
2553                 rtmp_formats = self._extract_smil_formats(
2554                     manifest_url('jwplayer.smil'),
2555                     video_id, fatal=False)
2556                 for rtmp_format in rtmp_formats:
2557                     rtsp_format = rtmp_format.copy()
2558                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2559                     del rtsp_format['play_path']
2560                     del rtsp_format['ext']
2561                     rtsp_format.update({
2562                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2563                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2564                         'protocol': 'rtsp',
2565                     })
2566                     formats.extend([rtmp_format, rtsp_format])
2567         else:
2568             for protocol in ('rtmp', 'rtsp'):
2569                 if protocol not in skip_protocols:
2570                     formats.append({
2571                         'url': '%s:%s' % (protocol, url_base),
2572                         'format_id': protocol,
2573                         'protocol': protocol,
2574                     })
2575         return formats
2576
2577     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2578         mobj = re.search(
2579             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2580             webpage)
2581         if mobj:
2582             try:
2583                 jwplayer_data = self._parse_json(mobj.group('options'),
2584                                                  video_id=video_id,
2585                                                  transform_source=transform_source)
2586             except ExtractorError:
2587                 pass
2588             else:
2589                 if isinstance(jwplayer_data, dict):
2590                     return jwplayer_data
2591
2592     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2593         jwplayer_data = self._find_jwplayer_data(
2594             webpage, video_id, transform_source=js_to_json)
2595         return self._parse_jwplayer_data(
2596             jwplayer_data, video_id, *args, **kwargs)
2597
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a parsed jwplayer configuration into an info dict (single
        playlist item) or a playlist result (multiple items).

        jwplayer_data: parsed jwplayer setup options; flattened legacy
            layouts (no 'playlist' key, bare item, flattened sources) are
            normalized in place below.
        require_title: when true, a playlist item without a title raises
            KeyError; otherwise title falls back to None.
        m3u8_id/mpd_id: format_id prefixes for HLS/DASH formats.
        rtmp_params: extra format fields merged into RTMP formats.
        base_url: base URL for resolving relative media/track/image URLs.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect subtitle-like tracks, keyed by label (default 'en')
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Skip non-subtitle tracks (e.g. chapters, thumbnails)
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                # A single YouTube URL: hand it off to the YouTube extractor
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2665
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer 'sources' list into a list of format dicts.

        Duplicate source URLs are only processed once. HLS/DASH/SMIL
        sources are expanded via the corresponding manifest extractors;
        other sources become a single progressive (or RTMP) format.

        rtmp_params: extra fields merged into RTMP formats.
        base_url: base URL for resolving relative 'file' URLs.
        """
        urls = []  # source URLs already handled, for de-duplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            # Prefer the declared MIME type; fall back to the URL extension
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split the URL at the mp4:/mp3:/flv: marker into the
                    # RTMP base URL and the play path
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2729
2730     def _live_title(self, name):
2731         """ Generate the title for a live video """
2732         now = datetime.datetime.now()
2733         now_str = now.strftime('%Y-%m-%d %H:%M')
2734         return name + ' ' + now_str
2735
2736     def _int(self, v, name, fatal=False, **kwargs):
2737         res = int_or_none(v, **kwargs)
2738         if 'get_attr' in kwargs:
2739             print(getattr(v, kwargs['get_attr']))
2740         if res is None:
2741             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2742             if fatal:
2743                 raise ExtractorError(msg)
2744             else:
2745                 self._downloader.report_warning(msg)
2746         return res
2747
2748     def _float(self, v, name, fatal=False, **kwargs):
2749         res = float_or_none(v, **kwargs)
2750         if res is None:
2751             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2752             if fatal:
2753                 raise ExtractorError(msg)
2754             else:
2755                 self._downloader.report_warning(msg)
2756         return res
2757
2758     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2759                     path='/', secure=False, discard=False, rest={}, **kwargs):
2760         cookie = compat_cookiejar.Cookie(
2761             0, name, value, port, port is not None, domain, True,
2762             domain.startswith('.'), path, True, secure, expire_time,
2763             discard, None, None, rest)
2764         self._downloader.cookiejar.set_cookie(cookie)
2765
2766     def _get_cookies(self, url):
2767         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2768         req = sanitized_Request(url)
2769         self._downloader.cookiejar.add_cookie_header(req)
2770         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2771
2772     def get_testcases(self, include_onlymatching=False):
2773         t = getattr(self, '_TEST', None)
2774         if t:
2775             assert not hasattr(self, '_TESTS'), \
2776                 '%s has _TEST and _TESTS' % type(self).__name__
2777             tests = [t]
2778         else:
2779             tests = getattr(self, '_TESTS', [])
2780         for t in tests:
2781             if not include_onlymatching and t.get('only_matching', False):
2782                 continue
2783             t['name'] = type(self).__name__[:-len('IE')]
2784             yield t
2785
2786     def is_suitable(self, age_limit):
2787         """ Test whether the extractor is generally suitable for the given
2788         age limit (i.e. pornographic sites are not, all others usually are) """
2789
2790         any_restricted = False
2791         for tc in self.get_testcases(include_onlymatching=False):
2792             if tc.get('playlist', []):
2793                 tc = tc['playlist'][0]
2794             is_restricted = age_restricted(
2795                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2796             if not is_restricted:
2797                 return True
2798             any_restricted = any_restricted or is_restricted
2799         return not any_restricted
2800
2801     def extract_subtitles(self, *args, **kwargs):
2802         if (self._downloader.params.get('writesubtitles', False) or
2803                 self._downloader.params.get('listsubtitles')):
2804             return self._get_subtitles(*args, **kwargs)
2805         return {}
2806
    def _get_subtitles(self, *args, **kwargs):
        """Subclass hook called by extract_subtitles(); expected to return
        a subtitles dict mapping language to subtitle item lists."""
        raise NotImplementedError('This method must be implemented by subclasses')
2809
2810     @staticmethod
2811     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2812         """ Merge subtitle items for one language. Items with duplicated URLs
2813         will be dropped. """
2814         list1_urls = set([item['url'] for item in subtitle_list1])
2815         ret = list(subtitle_list1)
2816         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2817         return ret
2818
2819     @classmethod
2820     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2821         """ Merge two subtitle dictionaries, language by language. """
2822         ret = dict(subtitle_dict1)
2823         for lang in subtitle_dict2:
2824             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2825         return ret
2826
2827     def extract_automatic_captions(self, *args, **kwargs):
2828         if (self._downloader.params.get('writeautomaticsub', False) or
2829                 self._downloader.params.get('listsubtitles')):
2830             return self._get_automatic_captions(*args, **kwargs)
2831         return {}
2832
    def _get_automatic_captions(self, *args, **kwargs):
        """Subclass hook called by extract_automatic_captions(); expected
        to return a subtitles dict mapping language to caption lists."""
        raise NotImplementedError('This method must be implemented by subclasses')
2835
2836     def mark_watched(self, *args, **kwargs):
2837         if (self._downloader.params.get('mark_watched', False) and
2838                 (self._get_login_info()[0] is not None or
2839                     self._downloader.params.get('cookiefile') is not None)):
2840             self._mark_watched(*args, **kwargs)
2841
    def _mark_watched(self, *args, **kwargs):
        """Subclass hook called by mark_watched(); must be implemented by
        subclasses that support marking videos as watched."""
        raise NotImplementedError('This method must be implemented by subclasses')
2844
2845     def geo_verification_headers(self):
2846         headers = {}
2847         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2848         if geo_verification_proxy:
2849             headers['Ytdl-request-proxy'] = geo_verification_proxy
2850         return headers
2851
2852     def _generic_id(self, url):
2853         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2854
2855     def _generic_title(self, url):
2856         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2857
2858
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (single result), 'all', or a positive count
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Dispatch the search query to _get_n_results with the result
        count encoded in the URL prefix."""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        terms = match.group('query')
        if prefix == '':
            # No prefix: a single result
            return self._get_n_results(terms, 1)
        if prefix == 'all':
            return self._get_n_results(terms, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, terms))
        if n > self._MAX_RESULTS:
            # Clamp to the per-extractor maximum and warn the user
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(terms, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        # Public accessor for the class-level _SEARCH_KEY
        return self._SEARCH_KEY