[extractor/common] Use source URL as Referer for HTML5 entries (closes #16849)
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_integer_types,
23     compat_http_client,
24     compat_os_name,
25     compat_str,
26     compat_urllib_error,
27     compat_urllib_parse_unquote,
28     compat_urllib_parse_urlencode,
29     compat_urllib_request,
30     compat_urlparse,
31     compat_xml_parse_error,
32 )
33 from ..downloader.f4m import (
34     get_base_url,
35     remove_encrypted_media,
36 )
37 from ..utils import (
38     NO_DEFAULT,
39     age_restricted,
40     base_url,
41     bug_reports_message,
42     clean_html,
43     compiled_regex_type,
44     determine_ext,
45     determine_protocol,
46     error_to_compat_str,
47     ExtractorError,
48     extract_attributes,
49     fix_xml_ampersands,
50     float_or_none,
51     GeoRestrictedError,
52     GeoUtils,
53     int_or_none,
54     js_to_json,
55     mimetype2ext,
56     orderedSet,
57     parse_codecs,
58     parse_duration,
59     parse_iso8601,
60     parse_m3u8_attributes,
61     RegexNotFoundError,
62     sanitized_Request,
63     sanitize_filename,
64     unescapeHTML,
65     unified_strdate,
66     unified_timestamp,
67     update_Request,
68     update_url_query,
69     urljoin,
70     url_basename,
71     xpath_element,
72     xpath_text,
73     xpath_with_ns,
74 )
75
76
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media (DASH, hls, hds)
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                  (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", "description", "uploader",
    "uploader_id", "uploader_url" attributes with the same semantics as videos
    (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
358
    # Class-level defaults; instance state is (re)set in __init__.
    _ready = False  # becomes True after _real_initialize() runs (see initialize())
    _downloader = None  # the YoutubeDL instance, attached via set_downloader()
    _x_forwarded_for_ip = None  # faked IP for geo bypass (see _initialize_geo_bypass())
    _GEO_BYPASS = True  # set False in subclasses to disable geo bypass (see class docstring)
    _GEO_COUNTRIES = None  # see class docstring
    _GEO_IP_BLOCKS = None  # see class docstring
    _WORKING = True  # set False for broken IEs to warn users and skip tests
366
367     def __init__(self, downloader=None):
368         """Constructor. Receives an optional downloader."""
369         self._ready = False
370         self._x_forwarded_for_ip = None
371         self.set_downloader(downloader)
372
373     @classmethod
374     def suitable(cls, url):
375         """Receives a URL and returns True if suitable for this IE."""
376
377         # This does not use has/getattr intentionally - we want to know whether
378         # we have cached the regexp for *this* class, whereas getattr would also
379         # match the superclass
380         if '_VALID_URL_RE' not in cls.__dict__:
381             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
382         return cls._VALID_URL_RE.match(url) is not None
383
384     @classmethod
385     def _match_id(cls, url):
386         if '_VALID_URL_RE' not in cls.__dict__:
387             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
388         m = cls._VALID_URL_RE.match(url)
389         assert m
390         return compat_str(m.group('id'))
391
392     @classmethod
393     def working(cls):
394         """Getter method for _WORKING."""
395         return cls._WORKING
396
397     def initialize(self):
398         """Initializes an instance (authentication, etc)."""
399         self._initialize_geo_bypass({
400             'countries': self._GEO_COUNTRIES,
401             'ip_blocks': self._GEO_IP_BLOCKS,
402         })
403         if not self._ready:
404             self._real_initialize()
405             self._ready = True
406
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        # Run at most once per instance: a previously faked IP (set here or
        # by __maybe_fake_ip_and_retry) is kept as is.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                # IP-block bypass takes precedence; never fall through to
                # country-based bypass once an IP block was used.
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                # NOTE(review): random_ipv4 is given a country code here but an
                # IP block above — presumably it accepts both; confirm in
                # utils.GeoUtils.
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))
494
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most two attempts: the second one only happens when a geo
            # restriction error was hit and a fake X-Forwarded-For IP could
            # be set up by __maybe_fake_ip_and_retry().
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        # Record the faked IP on the result; presumably
                        # consumed by YoutubeDL for subsequent requests —
                        # confirm at the call site.
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            # Already a proper extractor error — re-raise unchanged.
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
515
516     def __maybe_fake_ip_and_retry(self, countries):
517         if (not self._downloader.params.get('geo_bypass_country', None) and
518                 self._GEO_BYPASS and
519                 self._downloader.params.get('geo_bypass', True) and
520                 not self._x_forwarded_for_ip and
521                 countries):
522             country_code = random.choice(countries)
523             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
524             if self._x_forwarded_for_ip:
525                 self.report_warning(
526                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
527                     % (self._x_forwarded_for_ip, country_code.upper()))
528                 return True
529         return False
530
531     def set_downloader(self, downloader):
532         """Sets the downloader for this IE."""
533         self._downloader = downloader
534
535     def _real_initialize(self):
536         """Real initialization process. Redefine in subclasses."""
537         pass
538
539     def _real_extract(self, url):
540         """Real extraction process. Redefine in subclasses."""
541         pass
542
543     @classmethod
544     def ie_key(cls):
545         """A string for getting the InfoExtractor with get_info_extractor"""
546         return compat_str(cls.__name__[:-2])
547
548     @property
549     def IE_NAME(self):
550         return compat_str(type(self).__name__[:-2])
551
552     @staticmethod
553     def __can_accept_status_code(err, expected_status):
554         assert isinstance(err, compat_urllib_error.HTTPError)
555         if expected_status is None:
556             return False
557         if isinstance(expected_status, compat_integer_types):
558             return err.code == expected_status
559         elif isinstance(expected_status, (list, tuple)):
560             return err.code in expected_status
561         elif callable(expected_status):
562             return expected_status(err.code) is True
563         else:
564             assert False
565
566     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
567         """
568         Return the response handle.
569
570         See _download_webpage docstring for arguments specification.
571         """
572         if note is None:
573             self.report_download_webpage(video_id)
574         elif note is not False:
575             if video_id is None:
576                 self.to_screen('%s' % (note,))
577             else:
578                 self.to_screen('%s: %s' % (video_id, note))
579
580         # Some sites check X-Forwarded-For HTTP header in order to figure out
581         # the origin of the client behind proxy. This allows bypassing geo
582         # restriction by faking this header's value to IP that belongs to some
583         # geo unrestricted country. We will do so once we encounter any
584         # geo restriction error.
585         if self._x_forwarded_for_ip:
586             if 'X-Forwarded-For' not in headers:
587                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
588
589         if isinstance(url_or_request, compat_urllib_request.Request):
590             url_or_request = update_Request(
591                 url_or_request, data=data, headers=headers, query=query)
592         else:
593             if query:
594                 url_or_request = update_url_query(url_or_request, query)
595             if data is not None or headers:
596                 url_or_request = sanitized_Request(url_or_request, data, headers)
597         try:
598             return self._downloader.urlopen(url_or_request)
599         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
600             if isinstance(err, compat_urllib_error.HTTPError):
601                 if self.__can_accept_status_code(err, expected_status):
602                     return err.fp
603
604             if errnote is False:
605                 return False
606             if errnote is None:
607                 errnote = 'Unable to download webpage'
608
609             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
610             if fatal:
611                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
612             else:
613                 self._downloader.report_warning(errmsg)
614                 return False
615
616     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
617         """
618         Return a tuple (page content as string, URL handle).
619
620         See _download_webpage docstring for arguments specification.
621         """
622         # Strip hashes from the URL (#1038)
623         if isinstance(url_or_request, (compat_str, str)):
624             url_or_request = url_or_request.partition('#')[0]
625
626         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
627         if urlh is False:
628             assert not fatal
629             return False
630         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
631         return (content, urlh)
632
633     @staticmethod
634     def _guess_encoding_from_content(content_type, webpage_bytes):
635         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
636         if m:
637             encoding = m.group(1)
638         else:
639             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
640                           webpage_bytes[:1024])
641             if m:
642                 encoding = m.group(1).decode('ascii')
643             elif webpage_bytes.startswith(b'\xff\xfe'):
644                 encoding = 'utf-16'
645             else:
646                 encoding = 'utf-8'
647
648         return encoding
649
    def __check_blocked(self, content):
        """Raise ExtractorError if the downloaded page is a known block or
        censorship notice (Websense filtering, Indian censorship, Russian
        federal blocklist) rather than the requested content."""
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            # Try to surface the Websense details page in the error message
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # TTK (Russian ISP) block page referencing the federal blocklist
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
677
678     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
679         content_type = urlh.headers.get('Content-Type', '')
680         webpage_bytes = urlh.read()
681         if prefix is not None:
682             webpage_bytes = prefix + webpage_bytes
683         if not encoding:
684             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
685         if self._downloader.params.get('dump_intermediate_pages', False):
686             self.to_screen('Dumping request to ' + urlh.geturl())
687             dump = base64.b64encode(webpage_bytes).decode('ascii')
688             self._downloader.to_screen(dump)
689         if self._downloader.params.get('write_pages', False):
690             basen = '%s_%s' % (video_id, urlh.geturl())
691             if len(basen) > 240:
692                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
693                 basen = basen[:240 - len(h)] + h
694             raw_filename = basen + '.dump'
695             filename = sanitize_filename(raw_filename, restricted=True)
696             self.to_screen('Saving request to ' + filename)
697             # Working around MAX_PATH limitation on Windows (see
698             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
699             if compat_os_name == 'nt':
700                 absfilepath = os.path.abspath(filename)
701                 if len(absfilepath) > 259:
702                     filename = '\\\\?\\' + absfilepath
703             with open(filename, 'wb') as outf:
704                 outf.write(webpage_bytes)
705
706         try:
707             content = webpage_bytes.decode(encoding, 'replace')
708         except LookupError:
709             content = webpage_bytes.decode('utf-8', 'replace')
710
711         self.__check_blocked(content)
712
713         return content
714
715     def _download_webpage(
716             self, url_or_request, video_id, note=None, errnote=None,
717             fatal=True, tries=1, timeout=5, encoding=None, data=None,
718             headers={}, query={}, expected_status=None):
719         """
720         Return the data of the page as a string.
721
722         Arguments:
723         url_or_request -- plain text URL as a string or
724             a compat_urllib_request.Requestobject
725         video_id -- Video/playlist/item identifier (string)
726
727         Keyword arguments:
728         note -- note printed before downloading (string)
729         errnote -- note printed in case of an error (string)
730         fatal -- flag denoting whether error should be considered fatal,
731             i.e. whether it should cause ExtractionError to be raised,
732             otherwise a warning will be reported and extraction continued
733         tries -- number of tries
734         timeout -- sleep interval between tries
735         encoding -- encoding for a page content decoding, guessed automatically
736             when not explicitly specified
737         data -- POST data (bytes)
738         headers -- HTTP headers (dict)
739         query -- URL query (dict)
740         expected_status -- allows to accept failed HTTP requests (non 2xx
741             status code) by explicitly specifying a set of accepted status
742             codes. Can be any of the following entities:
743                 - an integer type specifying an exact failed status code to
744                   accept
745                 - a list or a tuple of integer types specifying a list of
746                   failed status codes to accept
747                 - a callable accepting an actual failed status code and
748                   returning True if it should be accepted
749             Note that this argument does not affect success status codes (2xx)
750             which are always accepted.
751         """
752
753         success = False
754         try_count = 0
755         while success is False:
756             try:
757                 res = self._download_webpage_handle(
758                     url_or_request, video_id, note, errnote, fatal,
759                     encoding=encoding, data=data, headers=headers, query=query,
760                     expected_status=expected_status)
761                 success = True
762             except compat_http_client.IncompleteRead as e:
763                 try_count += 1
764                 if try_count >= tries:
765                     raise e
766                 self._sleep(timeout, video_id)
767         if res is False:
768             return res
769         else:
770             content, _ = res
771             return content
772
773     def _download_xml_handle(
774             self, url_or_request, video_id, note='Downloading XML',
775             errnote='Unable to download XML', transform_source=None,
776             fatal=True, encoding=None, data=None, headers={}, query={},
777             expected_status=None):
778         """
779         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
780
781         See _download_webpage docstring for arguments specification.
782         """
783         res = self._download_webpage_handle(
784             url_or_request, video_id, note, errnote, fatal=fatal,
785             encoding=encoding, data=data, headers=headers, query=query,
786             expected_status=expected_status)
787         if res is False:
788             return res
789         xml_string, urlh = res
790         return self._parse_xml(
791             xml_string, video_id, transform_source=transform_source,
792             fatal=fatal), urlh
793
794     def _download_xml(
795             self, url_or_request, video_id,
796             note='Downloading XML', errnote='Unable to download XML',
797             transform_source=None, fatal=True, encoding=None,
798             data=None, headers={}, query={}, expected_status=None):
799         """
800         Return the xml as an xml.etree.ElementTree.Element.
801
802         See _download_webpage docstring for arguments specification.
803         """
804         res = self._download_xml_handle(
805             url_or_request, video_id, note=note, errnote=errnote,
806             transform_source=transform_source, fatal=fatal, encoding=encoding,
807             data=data, headers=headers, query=query,
808             expected_status=expected_status)
809         return res if res is False else res[0]
810
811     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
812         if transform_source:
813             xml_string = transform_source(xml_string)
814         try:
815             return compat_etree_fromstring(xml_string.encode('utf-8'))
816         except compat_xml_parse_error as ve:
817             errmsg = '%s: Failed to parse XML ' % video_id
818             if fatal:
819                 raise ExtractorError(errmsg, cause=ve)
820             else:
821                 self.report_warning(errmsg + str(ve))
822
823     def _download_json_handle(
824             self, url_or_request, video_id, note='Downloading JSON metadata',
825             errnote='Unable to download JSON metadata', transform_source=None,
826             fatal=True, encoding=None, data=None, headers={}, query={},
827             expected_status=None):
828         """
829         Return a tuple (JSON object, URL handle).
830
831         See _download_webpage docstring for arguments specification.
832         """
833         res = self._download_webpage_handle(
834             url_or_request, video_id, note, errnote, fatal=fatal,
835             encoding=encoding, data=data, headers=headers, query=query,
836             expected_status=expected_status)
837         if res is False:
838             return res
839         json_string, urlh = res
840         return self._parse_json(
841             json_string, video_id, transform_source=transform_source,
842             fatal=fatal), urlh
843
844     def _download_json(
845             self, url_or_request, video_id, note='Downloading JSON metadata',
846             errnote='Unable to download JSON metadata', transform_source=None,
847             fatal=True, encoding=None, data=None, headers={}, query={},
848             expected_status=None):
849         """
850         Return the JSON object as a dict.
851
852         See _download_webpage docstring for arguments specification.
853         """
854         res = self._download_json_handle(
855             url_or_request, video_id, note=note, errnote=errnote,
856             transform_source=transform_source, fatal=fatal, encoding=encoding,
857             data=data, headers=headers, query=query,
858             expected_status=expected_status)
859         return res if res is False else res[0]
860
861     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
862         if transform_source:
863             json_string = transform_source(json_string)
864         try:
865             return json.loads(json_string)
866         except ValueError as ve:
867             errmsg = '%s: Failed to parse JSON ' % video_id
868             if fatal:
869                 raise ExtractorError(errmsg, cause=ve)
870             else:
871                 self.report_warning(errmsg + str(ve))
872
873     def report_warning(self, msg, video_id=None):
874         idstr = '' if video_id is None else '%s: ' % video_id
875         self._downloader.report_warning(
876             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
877
878     def to_screen(self, msg):
879         """Print msg to screen, prefixing it with '[ie_name]'"""
880         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
881
882     def report_extraction(self, id_or_name):
883         """Report information extraction."""
884         self.to_screen('%s: Extracting information' % id_or_name)
885
886     def report_download_webpage(self, video_id):
887         """Report webpage download."""
888         self.to_screen('%s: Downloading webpage' % video_id)
889
890     def report_age_confirmation(self):
891         """Report attempt to confirm age."""
892         self.to_screen('Confirming age')
893
894     def report_login(self):
895         """Report attempt to log in."""
896         self.to_screen('Logging in')
897
898     @staticmethod
899     def raise_login_required(msg='This video is only available for registered users'):
900         raise ExtractorError(
901             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
902             expected=True)
903
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        # Abort extraction with a GeoRestrictedError; countries, when given,
        # presumably lists country codes the video is accessible from —
        # see GeoRestrictedError in utils for the exact semantics.
        raise GeoRestrictedError(msg, countries=countries)
907
908     # Methods for following #608
909     @staticmethod
910     def url_result(url, ie=None, video_id=None, video_title=None):
911         """Returns a URL that points to a page that should be processed"""
912         # TODO: ie should be the class used for getting the info
913         video_info = {'_type': 'url',
914                       'url': url,
915                       'ie_key': ie}
916         if video_id is not None:
917             video_info['id'] = video_id
918         if video_title is not None:
919             video_info['title'] = video_title
920         return video_info
921
922     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
923         urls = orderedSet(
924             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
925             for m in matches)
926         return self.playlist_result(
927             urls, playlist_id=playlist_id, playlist_title=playlist_title)
928
929     @staticmethod
930     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
931         """Returns a playlist"""
932         video_info = {'_type': 'playlist',
933                       'entries': entries}
934         if playlist_id:
935             video_info['id'] = playlist_id
936         if playlist_title:
937             video_info['title'] = playlist_title
938         if playlist_description:
939             video_info['description'] = playlist_description
940         return video_info
941
942     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
943         """
944         Perform a regex search on the given string, using a single or a list of
945         patterns returning the first matching group.
946         In case of failure return a default value or raise a WARNING or a
947         RegexNotFoundError, depending on fatal, specifying the field name.
948         """
949         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
950             mobj = re.search(pattern, string, flags)
951         else:
952             for p in pattern:
953                 mobj = re.search(p, string, flags)
954                 if mobj:
955                     break
956
957         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
958             _name = '\033[0;34m%s\033[0m' % name
959         else:
960             _name = name
961
962         if mobj:
963             if group is None:
964                 # return the first matching group
965                 return next(g for g in mobj.groups() if g is not None)
966             else:
967                 return mobj.group(group)
968         elif default is not NO_DEFAULT:
969             return default
970         elif fatal:
971             raise RegexNotFoundError('Unable to extract %s' % _name)
972         else:
973             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
974             return None
975
976     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
977         """
978         Like _search_regex, but strips HTML tags and unescapes entities.
979         """
980         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
981         if res:
982             return clean_html(res).strip()
983         else:
984             return res
985
986     def _get_netrc_login_info(self, netrc_machine=None):
987         username = None
988         password = None
989         netrc_machine = netrc_machine or self._NETRC_MACHINE
990
991         if self._downloader.params.get('usenetrc', False):
992             try:
993                 info = netrc.netrc().authenticators(netrc_machine)
994                 if info is not None:
995                     username = info[0]
996                     password = info[2]
997                 else:
998                     raise netrc.NetrcParseError(
999                         'No authenticators for %s' % netrc_machine)
1000             except (IOError, netrc.NetrcParseError) as err:
1001                 self._downloader.report_warning(
1002                     'parsing .netrc: %s' % error_to_compat_str(err))
1003
1004         return username, password
1005
1006     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1007         """
1008         Get the login info as (username, password)
1009         First look for the manually specified credentials using username_option
1010         and password_option as keys in params dictionary. If no such credentials
1011         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1012         value.
1013         If there's no info available, return (None, None)
1014         """
1015         if self._downloader is None:
1016             return (None, None)
1017
1018         downloader_params = self._downloader.params
1019
1020         # Attempt to use provided username and password or .netrc data
1021         if downloader_params.get(username_option) is not None:
1022             username = downloader_params[username_option]
1023             password = downloader_params[password_option]
1024         else:
1025             username, password = self._get_netrc_login_info(netrc_machine)
1026
1027         return username, password
1028
1029     def _get_tfa_info(self, note='two-factor verification code'):
1030         """
1031         Get the two-factor authentication info
1032         TODO - asking the user will be required for sms/phone verify
1033         currently just uses the command line option
1034         If there's no info available, return None
1035         """
1036         if self._downloader is None:
1037             return None
1038         downloader_params = self._downloader.params
1039
1040         if downloader_params.get('twofactor') is not None:
1041             return downloader_params['twofactor']
1042
1043         return compat_getpass('Type %s and press [Return]: ' % note)
1044
1045     # Helper functions for extracting OpenGraph info
1046     @staticmethod
1047     def _og_regexes(prop):
1048         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1049         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
1050                        % {'prop': re.escape(prop)})
1051         template = r'<meta[^>]+?%s[^>]+?%s'
1052         return [
1053             template % (property_re, content_re),
1054             template % (content_re, property_re),
1055         ]
1056
1057     @staticmethod
1058     def _meta_regex(prop):
1059         return r'''(?isx)<meta
1060                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1061                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1062
1063     def _og_search_property(self, prop, html, name=None, **kargs):
1064         if not isinstance(prop, (list, tuple)):
1065             prop = [prop]
1066         if name is None:
1067             name = 'OpenGraph %s' % prop[0]
1068         og_regexes = []
1069         for p in prop:
1070             og_regexes.extend(self._og_regexes(p))
1071         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1072         if escaped is None:
1073             return None
1074         return unescapeHTML(escaped)
1075
1076     def _og_search_thumbnail(self, html, **kargs):
1077         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1078
1079     def _og_search_description(self, html, **kargs):
1080         return self._og_search_property('description', html, fatal=False, **kargs)
1081
1082     def _og_search_title(self, html, **kargs):
1083         return self._og_search_property('title', html, **kargs)
1084
1085     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1086         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1087         if secure:
1088             regexes = self._og_regexes('video:secure_url') + regexes
1089         return self._html_search_regex(regexes, html, name, **kargs)
1090
1091     def _og_search_url(self, html, **kargs):
1092         return self._og_search_property('url', html, **kargs)
1093
1094     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1095         if not isinstance(name, (list, tuple)):
1096             name = [name]
1097         if display_name is None:
1098             display_name = name[0]
1099         return self._html_search_regex(
1100             [self._meta_regex(n) for n in name],
1101             html, display_name, fatal=fatal, group='content', **kwargs)
1102
1103     def _dc_search_uploader(self, html):
1104         return self._html_search_meta('dc.creator', html, 'uploader')
1105
1106     def _rta_search(self, html):
1107         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1108         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1109                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1110                      html):
1111             return 18
1112         return 0
1113
1114     def _media_rating_search(self, html):
1115         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1116         rating = self._html_search_meta('rating', html)
1117
1118         if not rating:
1119             return None
1120
1121         RATING_TABLE = {
1122             'safe for kids': 0,
1123             'general': 8,
1124             '14 years': 14,
1125             'mature': 17,
1126             'restricted': 19,
1127         }
1128         return RATING_TABLE.get(rating.lower())
1129
1130     def _family_friendly_search(self, html):
1131         # See http://schema.org/VideoObject
1132         family_friendly = self._html_search_meta(
1133             'isFamilyFriendly', html, default=None)
1134
1135         if not family_friendly:
1136             return None
1137
1138         RATING_TABLE = {
1139             '1': 0,
1140             'true': 0,
1141             '0': 18,
1142             'false': 18,
1143         }
1144         return RATING_TABLE.get(family_friendly.lower())
1145
1146     def _twitter_search_player(self, html):
1147         return self._html_search_meta('twitter:player', html,
1148                                       'twitter card player')
1149
1150     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1151         json_ld = self._search_regex(
1152             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
1153             html, 'JSON-LD', group='json_ld', **kwargs)
1154         default = kwargs.get('default', NO_DEFAULT)
1155         if not json_ld:
1156             return default if default is not NO_DEFAULT else {}
1157         # JSON-LD may be malformed and thus `fatal` should be respected.
1158         # At the same time `default` may be passed that assumes `fatal=False`
1159         # for _search_regex. Let's simulate the same behavior here as well.
1160         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1161         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1162
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Extract video metadata from a JSON-LD object.

        json_ld may be a JSON string, a dict or a list/tuple of dicts.
        Returns an info dict with None-valued entries removed; {} when
        nothing could be extracted. When expected_type is given, an item
        of a different @type aborts extraction.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        # Normalize to a list of JSON-LD items
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org interactionType suffixes to info dict count kinds
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            # Fill *_count info fields from schema.org InteractionCounter
            # entries; only the first count per kind is kept
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # interactionType is a URL; its last path segment names the action
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Merge the fields of a schema.org VideoObject into info
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only items declaring the schema.org context are considered
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Keep scanning remaining items for more VideoObjects
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                # Only the first matching schema.org item is processed
                break
        return dict((k, v) for k, v in info.items() if v is not None)
1256
1257     @staticmethod
1258     def _hidden_inputs(html):
1259         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1260         hidden_inputs = {}
1261         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1262             attrs = extract_attributes(input)
1263             if not input:
1264                 continue
1265             if attrs.get('type') not in ('hidden', 'submit'):
1266                 continue
1267             name = attrs.get('name') or attrs.get('id')
1268             value = attrs.get('value')
1269             if name and value is not None:
1270                 hidden_inputs[name] = value
1271         return hidden_inputs
1272
1273     def _form_hidden_inputs(self, form_id, html):
1274         form = self._search_regex(
1275             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1276             html, '%s form' % form_id, group='form')
1277         return self._hidden_inputs(form)
1278
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best.

        When field_preference (a list/tuple of format field names) is given,
        sort by those fields only; otherwise use the built-in heuristic key
        below. Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering; missing values sort last
                # ('' for format_id, -1 for everything else)
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            # Plain HTTP(S) is preferred over RTSP and other protocols
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Lexicographic tuple comparison: each later field only breaks
            # ties among the earlier ones; missing values sort as -1 / ''
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1354
1355     def _check_formats(self, formats, video_id):
1356         if formats:
1357             formats[:] = filter(
1358                 lambda f: self._is_valid_url(
1359                     f['url'], video_id,
1360                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1361                 formats)
1362
1363     @staticmethod
1364     def _remove_duplicate_formats(formats):
1365         format_urls = set()
1366         unique_formats = []
1367         for f in formats:
1368             if f['url'] not in format_urls:
1369                 format_urls.add(f['url'])
1370                 unique_formats.append(f)
1371         formats[:] = unique_formats
1372
1373     def _is_valid_url(self, url, video_id, item='video', headers={}):
1374         url = self._proto_relative_url(url, scheme='http:')
1375         # For now assume non HTTP(S) URLs always valid
1376         if not (url.startswith('http://') or url.startswith('https://')):
1377             return True
1378         try:
1379             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1380             return True
1381         except ExtractorError as e:
1382             if isinstance(e.cause, compat_urllib_error.URLError):
1383                 self.to_screen(
1384                     '%s: %s URL is invalid, skipping' % (video_id, item))
1385                 return False
1386             raise
1387
1388     def http_scheme(self):
1389         """ Either "http:" or "https:", depending on the user's preferences """
1390         return (
1391             'http:'
1392             if self._downloader.params.get('prefer_insecure', False)
1393             else 'https:')
1394
1395     def _proto_relative_url(self, url, scheme=None):
1396         if url is None:
1397             return url
1398         if url.startswith('//'):
1399             if scheme is None:
1400                 scheme = self.http_scheme()
1401             return scheme + url
1402         else:
1403             return url
1404
1405     def _sleep(self, timeout, video_id, msg_template=None):
1406         if msg_template is None:
1407             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1408         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1409         self.to_screen(msg)
1410         time.sleep(timeout)
1411
1412     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1413                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1414                              fatal=True, m3u8_id=None):
1415         manifest = self._download_xml(
1416             manifest_url, video_id, 'Downloading f4m manifest',
1417             'Unable to download f4m manifest',
1418             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1419             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1420             transform_source=transform_source,
1421             fatal=fatal)
1422
1423         if manifest is False:
1424             return []
1425
1426         return self._parse_f4m_formats(
1427             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1428             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1429
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an already-downloaded f4m (HDS) manifest XML element into
        a list of format dicts; recurses into referenced sub-manifests.
        Returns [] for Akamai player-verification protected media."""
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # No 1.0 media nodes; fall back to the 2.0 namespace
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio/* mimeType marks the whole manifest as audio-only
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1527
1528     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1529         return {
1530             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1531             'url': m3u8_url,
1532             'ext': ext,
1533             'protocol': 'm3u8',
1534             'preference': preference - 100 if preference else -100,
1535             'resolution': 'multiple',
1536             'format_note': 'Quality selection URL',
1537         }
1538
1539     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1540                               entry_protocol='m3u8', preference=None,
1541                               m3u8_id=None, note=None, errnote=None,
1542                               fatal=True, live=False):
1543         res = self._download_webpage_handle(
1544             m3u8_url, video_id,
1545             note=note or 'Downloading m3u8 information',
1546             errnote=errnote or 'Failed to download m3u8 information',
1547             fatal=fatal)
1548
1549         if res is False:
1550             return []
1551
1552         m3u8_doc, urlh = res
1553         m3u8_url = urlh.geturl()
1554
1555         return self._parse_m3u8_formats(
1556             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1557             preference=preference, m3u8_id=m3u8_id, live=live)
1558
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse the text of an HLS (m3u8) playlist into format dicts.

        Returns [] for DRM-protected playlists (Adobe Flash Access, Apple
        FairPlay). A media playlist yields a single format; a master
        playlist yields one format per variant stream plus one per
        audio/video rendition that carries its own URI.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve possibly-relative playlist entries against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; they describe
        # the URI line that follows it
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Register one EXT-X-MEDIA rendition in its group and, for
            # AUDIO/VIDEO renditions with their own URI, emit a format.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Plain URI line: a variant stream described by the
                # preceding EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing audio group an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1717
1718     @staticmethod
1719     def _xpath_ns(path, namespace=None):
1720         if not namespace:
1721             return path
1722         out = []
1723         for c in path.split('/'):
1724             if not c or c == '.':
1725                 out.append(c)
1726             else:
1727                 out.append('{%s}%s' % (namespace, c))
1728         return '/'.join(out)
1729
1730     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1731         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1732
1733         if smil is False:
1734             assert not fatal
1735             return []
1736
1737         namespace = self._parse_smil_namespace(smil)
1738
1739         return self._parse_smil_formats(
1740             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1741
1742     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1743         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1744         if smil is False:
1745             return {}
1746         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1747
1748     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1749         return self._download_xml(
1750             smil_url, video_id, 'Downloading SMIL file',
1751             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1752
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build an info dict (id, title, formats, subtitles, thumbnails,
        description, upload date) from a parsed SMIL document."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # NOTE: the passed video_id is replaced by one derived from the
        # SMIL URL's basename (without extension)
        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        # Pick up title/description/date from <head><meta> entries,
        # first match wins for each field
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
1792
    def _parse_smil_namespace(self, smil):
        # Extract the XML namespace URI from the root <smil> tag, if any
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1796
1797     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1798         base = smil_url
1799         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1800             b = meta.get('base') or meta.get('httpBase')
1801             if b:
1802                 base = b
1803                 break
1804
1805         formats = []
1806         rtmp_count = 0
1807         http_count = 0
1808         m3u8_count = 0
1809
1810         srcs = []
1811         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1812         for medium in media:
1813             src = medium.get('src')
1814             if not src or src in srcs:
1815                 continue
1816             srcs.append(src)
1817
1818             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1819             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1820             width = int_or_none(medium.get('width'))
1821             height = int_or_none(medium.get('height'))
1822             proto = medium.get('proto')
1823             ext = medium.get('ext')
1824             src_ext = determine_ext(src)
1825             streamer = medium.get('streamer') or base
1826
1827             if proto == 'rtmp' or streamer.startswith('rtmp'):
1828                 rtmp_count += 1
1829                 formats.append({
1830                     'url': streamer,
1831                     'play_path': src,
1832                     'ext': 'flv',
1833                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1834                     'tbr': bitrate,
1835                     'filesize': filesize,
1836                     'width': width,
1837                     'height': height,
1838                 })
1839                 if transform_rtmp_url:
1840                     streamer, src = transform_rtmp_url(streamer, src)
1841                     formats[-1].update({
1842                         'url': streamer,
1843                         'play_path': src,
1844                     })
1845                 continue
1846
1847             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1848             src_url = src_url.strip()
1849
1850             if proto == 'm3u8' or src_ext == 'm3u8':
1851                 m3u8_formats = self._extract_m3u8_formats(
1852                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1853                 if len(m3u8_formats) == 1:
1854                     m3u8_count += 1
1855                     m3u8_formats[0].update({
1856                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1857                         'tbr': bitrate,
1858                         'width': width,
1859                         'height': height,
1860                     })
1861                 formats.extend(m3u8_formats)
1862                 continue
1863
1864             if src_ext == 'f4m':
1865                 f4m_url = src_url
1866                 if not f4m_params:
1867                     f4m_params = {
1868                         'hdcore': '3.2.0',
1869                         'plugin': 'flowplayer-3.2.0.1',
1870                     }
1871                 f4m_url += '&' if '?' in f4m_url else '?'
1872                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1873                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1874                 continue
1875
1876             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1877                 http_count += 1
1878                 formats.append({
1879                     'url': src_url,
1880                     'ext': ext or src_ext or 'flv',
1881                     'format_id': 'http-%d' % (bitrate or http_count),
1882                     'tbr': bitrate,
1883                     'filesize': filesize,
1884                     'width': width,
1885                     'height': height,
1886                 })
1887                 continue
1888
1889         return formats
1890
1891     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1892         urls = []
1893         subtitles = {}
1894         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1895             src = textstream.get('src')
1896             if not src or src in urls:
1897                 continue
1898             urls.append(src)
1899             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1900             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1901             subtitles.setdefault(lang, []).append({
1902                 'url': src,
1903                 'ext': ext,
1904             })
1905         return subtitles
1906
1907     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1908         xspf = self._download_xml(
1909             xspf_url, playlist_id, 'Downloading xpsf playlist',
1910             'Unable to download xspf manifest', fatal=fatal)
1911         if xspf is False:
1912             return []
1913         return self._parse_xspf(
1914             xspf, playlist_id, xspf_url=xspf_url,
1915             xspf_base_url=base_url(xspf_url))
1916
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        """Parse an XSPF playlist XML document into a list of entry info
        dicts.

        xspf_url is recorded as each format's manifest_url;
        xspf_base_url is used to resolve relative <location> URLs.
        """
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = []
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                if not format_url:
                    continue
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    # s1:* attributes are StreamOne player extensions
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
                })
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries
1957
1958     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1959         res = self._download_xml_handle(
1960             mpd_url, video_id,
1961             note=note or 'Downloading MPD manifest',
1962             errnote=errnote or 'Failed to download MPD manifest',
1963             fatal=fatal)
1964         if res is False:
1965             return []
1966         mpd_doc, urlh = res
1967         mpd_base_url = base_url(urlh.geturl())
1968
1969         return self._parse_mpd_formats(
1970             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1971             formats_dict=formats_dict, mpd_url=mpd_url)
1972
1973     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1974         """
1975         Parse formats from MPD manifest.
1976         References:
1977          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1978             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1979          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1980         """
1981         if mpd_doc.get('type') == 'dynamic':
1982             return []
1983
1984         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1985
1986         def _add_ns(path):
1987             return self._xpath_ns(path, namespace)
1988
1989         def is_drm_protected(element):
1990             return element.find(_add_ns('ContentProtection')) is not None
1991
1992         def extract_multisegment_info(element, ms_parent_info):
1993             ms_info = ms_parent_info.copy()
1994
1995             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1996             # common attributes and elements.  We will only extract relevant
1997             # for us.
1998             def extract_common(source):
1999                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2000                 if segment_timeline is not None:
2001                     s_e = segment_timeline.findall(_add_ns('S'))
2002                     if s_e:
2003                         ms_info['total_number'] = 0
2004                         ms_info['s'] = []
2005                         for s in s_e:
2006                             r = int(s.get('r', 0))
2007                             ms_info['total_number'] += 1 + r
2008                             ms_info['s'].append({
2009                                 't': int(s.get('t', 0)),
2010                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2011                                 'd': int(s.attrib['d']),
2012                                 'r': r,
2013                             })
2014                 start_number = source.get('startNumber')
2015                 if start_number:
2016                     ms_info['start_number'] = int(start_number)
2017                 timescale = source.get('timescale')
2018                 if timescale:
2019                     ms_info['timescale'] = int(timescale)
2020                 segment_duration = source.get('duration')
2021                 if segment_duration:
2022                     ms_info['segment_duration'] = float(segment_duration)
2023
2024             def extract_Initialization(source):
2025                 initialization = source.find(_add_ns('Initialization'))
2026                 if initialization is not None:
2027                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2028
2029             segment_list = element.find(_add_ns('SegmentList'))
2030             if segment_list is not None:
2031                 extract_common(segment_list)
2032                 extract_Initialization(segment_list)
2033                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2034                 if segment_urls_e:
2035                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2036             else:
2037                 segment_template = element.find(_add_ns('SegmentTemplate'))
2038                 if segment_template is not None:
2039                     extract_common(segment_template)
2040                     media = segment_template.get('media')
2041                     if media:
2042                         ms_info['media'] = media
2043                     initialization = segment_template.get('initialization')
2044                     if initialization:
2045                         ms_info['initialization'] = initialization
2046                     else:
2047                         extract_Initialization(segment_template)
2048             return ms_info
2049
2050         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2051         formats = []
2052         for period in mpd_doc.findall(_add_ns('Period')):
2053             period_duration = parse_duration(period.get('duration')) or mpd_duration
2054             period_ms_info = extract_multisegment_info(period, {
2055                 'start_number': 1,
2056                 'timescale': 1,
2057             })
2058             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2059                 if is_drm_protected(adaptation_set):
2060                     continue
2061                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2062                 for representation in adaptation_set.findall(_add_ns('Representation')):
2063                     if is_drm_protected(representation):
2064                         continue
2065                     representation_attrib = adaptation_set.attrib.copy()
2066                     representation_attrib.update(representation.attrib)
2067                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2068                     mime_type = representation_attrib['mimeType']
2069                     content_type = mime_type.split('/')[0]
2070                     if content_type == 'text':
2071                         # TODO implement WebVTT downloading
2072                         pass
2073                     elif content_type in ('video', 'audio'):
2074                         base_url = ''
2075                         for element in (representation, adaptation_set, period, mpd_doc):
2076                             base_url_e = element.find(_add_ns('BaseURL'))
2077                             if base_url_e is not None:
2078                                 base_url = base_url_e.text + base_url
2079                                 if re.match(r'^https?://', base_url):
2080                                     break
2081                         if mpd_base_url and not re.match(r'^https?://', base_url):
2082                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2083                                 mpd_base_url += '/'
2084                             base_url = mpd_base_url + base_url
2085                         representation_id = representation_attrib.get('id')
2086                         lang = representation_attrib.get('lang')
2087                         url_el = representation.find(_add_ns('BaseURL'))
2088                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2089                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2090                         f = {
2091                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2092                             'url': base_url,
2093                             'manifest_url': mpd_url,
2094                             'ext': mimetype2ext(mime_type),
2095                             'width': int_or_none(representation_attrib.get('width')),
2096                             'height': int_or_none(representation_attrib.get('height')),
2097                             'tbr': float_or_none(bandwidth, 1000),
2098                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2099                             'fps': int_or_none(representation_attrib.get('frameRate')),
2100                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2101                             'format_note': 'DASH %s' % content_type,
2102                             'filesize': filesize,
2103                             'container': mimetype2ext(mime_type) + '_dash',
2104                         }
2105                         f.update(parse_codecs(representation_attrib.get('codecs')))
2106                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2107
2108                         def prepare_template(template_name, identifiers):
2109                             t = representation_ms_info[template_name]
2110                             t = t.replace('$RepresentationID$', representation_id)
2111                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2112                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2113                             t.replace('$$', '$')
2114                             return t
2115
2116                         # @initialization is a regular template like @media one
2117                         # so it should be handled just the same way (see
2118                         # https://github.com/rg3/youtube-dl/issues/11605)
2119                         if 'initialization' in representation_ms_info:
2120                             initialization_template = prepare_template(
2121                                 'initialization',
2122                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2123                                 # $Time$ shall not be included for @initialization thus
2124                                 # only $Bandwidth$ remains
2125                                 ('Bandwidth', ))
2126                             representation_ms_info['initialization_url'] = initialization_template % {
2127                                 'Bandwidth': bandwidth,
2128                             }
2129
2130                         def location_key(location):
2131                             return 'url' if re.match(r'^https?://', location) else 'path'
2132
2133                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2134
2135                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2136                             media_location_key = location_key(media_template)
2137
2138                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2139                             # can't be used at the same time
2140                             if '%(Number' in media_template and 's' not in representation_ms_info:
2141                                 segment_duration = None
2142                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2143                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2144                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2145                                 representation_ms_info['fragments'] = [{
2146                                     media_location_key: media_template % {
2147                                         'Number': segment_number,
2148                                         'Bandwidth': bandwidth,
2149                                     },
2150                                     'duration': segment_duration,
2151                                 } for segment_number in range(
2152                                     representation_ms_info['start_number'],
2153                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2154                             else:
2155                                 # $Number*$ or $Time$ in media template with S list available
2156                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2157                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2158                                 representation_ms_info['fragments'] = []
2159                                 segment_time = 0
2160                                 segment_d = None
2161                                 segment_number = representation_ms_info['start_number']
2162
2163                                 def add_segment_url():
2164                                     segment_url = media_template % {
2165                                         'Time': segment_time,
2166                                         'Bandwidth': bandwidth,
2167                                         'Number': segment_number,
2168                                     }
2169                                     representation_ms_info['fragments'].append({
2170                                         media_location_key: segment_url,
2171                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2172                                     })
2173
2174                                 for num, s in enumerate(representation_ms_info['s']):
2175                                     segment_time = s.get('t') or segment_time
2176                                     segment_d = s['d']
2177                                     add_segment_url()
2178                                     segment_number += 1
2179                                     for r in range(s.get('r', 0)):
2180                                         segment_time += segment_d
2181                                         add_segment_url()
2182                                         segment_number += 1
2183                                     segment_time += segment_d
2184                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2185                             # No media template
2186                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2187                             # or any YouTube dashsegments video
2188                             fragments = []
2189                             segment_index = 0
2190                             timescale = representation_ms_info['timescale']
2191                             for s in representation_ms_info['s']:
2192                                 duration = float_or_none(s['d'], timescale)
2193                                 for r in range(s.get('r', 0) + 1):
2194                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2195                                     fragments.append({
2196                                         location_key(segment_uri): segment_uri,
2197                                         'duration': duration,
2198                                     })
2199                                     segment_index += 1
2200                             representation_ms_info['fragments'] = fragments
2201                         elif 'segment_urls' in representation_ms_info:
2202                             # Segment URLs with no SegmentTimeline
2203                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2204                             # https://github.com/rg3/youtube-dl/pull/14844
2205                             fragments = []
2206                             segment_duration = float_or_none(
2207                                 representation_ms_info['segment_duration'],
2208                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2209                             for segment_url in representation_ms_info['segment_urls']:
2210                                 fragment = {
2211                                     location_key(segment_url): segment_url,
2212                                 }
2213                                 if segment_duration:
2214                                     fragment['duration'] = segment_duration
2215                                 fragments.append(fragment)
2216                             representation_ms_info['fragments'] = fragments
2217                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2218                         # No fragments key is present in this case.
2219                         if 'fragments' in representation_ms_info:
2220                             f.update({
2221                                 'fragment_base_url': base_url,
2222                                 'fragments': [],
2223                                 'protocol': 'http_dash_segments',
2224                             })
2225                             if 'initialization_url' in representation_ms_info:
2226                                 initialization_url = representation_ms_info['initialization_url']
2227                                 if not f.get('url'):
2228                                     f['url'] = initialization_url
2229                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2230                             f['fragments'].extend(representation_ms_info['fragments'])
2231                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2232                         # is not necessarily unique within a Period thus formats with
2233                         # the same `format_id` are quite possible. There are numerous examples
2234                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2235                         # https://github.com/rg3/youtube-dl/issues/13919)
2236                         full_info = formats_dict.get(representation_id, {}).copy()
2237                         full_info.update(f)
2238                         formats.append(full_info)
2239                     else:
2240                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2241         return formats
2242
2243     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2244         res = self._download_xml_handle(
2245             ism_url, video_id,
2246             note=note or 'Downloading ISM manifest',
2247             errnote=errnote or 'Failed to download ISM manifest',
2248             fatal=fatal)
2249         if res is False:
2250             return []
2251         ism_doc, urlh = res
2252
2253         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2254
2255     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2256         """
2257         Parse formats from ISM manifest.
2258         References:
2259          1. [MS-SSTR]: Smooth Streaming Protocol,
2260             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2261         """
2262         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2263             return []
2264
2265         duration = int(ism_doc.attrib['Duration'])
2266         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2267
2268         formats = []
2269         for stream in ism_doc.findall('StreamIndex'):
2270             stream_type = stream.get('Type')
2271             if stream_type not in ('video', 'audio'):
2272                 continue
2273             url_pattern = stream.attrib['Url']
2274             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2275             stream_name = stream.get('Name')
2276             for track in stream.findall('QualityLevel'):
2277                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2278                 # TODO: add support for WVC1 and WMAP
2279                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2280                     self.report_warning('%s is not a supported codec' % fourcc)
2281                     continue
2282                 tbr = int(track.attrib['Bitrate']) // 1000
2283                 # [1] does not mention Width and Height attributes. However,
2284                 # they're often present while MaxWidth and MaxHeight are
2285                 # missing, so should be used as fallbacks
2286                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2287                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2288                 sampling_rate = int_or_none(track.get('SamplingRate'))
2289
2290                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2291                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2292
2293                 fragments = []
2294                 fragment_ctx = {
2295                     'time': 0,
2296                 }
2297                 stream_fragments = stream.findall('c')
2298                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2299                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2300                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2301                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2302                     if not fragment_ctx['duration']:
2303                         try:
2304                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2305                         except IndexError:
2306                             next_fragment_time = duration
2307                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2308                     for _ in range(fragment_repeat):
2309                         fragments.append({
2310                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2311                             'duration': fragment_ctx['duration'] / stream_timescale,
2312                         })
2313                         fragment_ctx['time'] += fragment_ctx['duration']
2314
2315                 format_id = []
2316                 if ism_id:
2317                     format_id.append(ism_id)
2318                 if stream_name:
2319                     format_id.append(stream_name)
2320                 format_id.append(compat_str(tbr))
2321
2322                 formats.append({
2323                     'format_id': '-'.join(format_id),
2324                     'url': ism_url,
2325                     'manifest_url': ism_url,
2326                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2327                     'width': width,
2328                     'height': height,
2329                     'tbr': tbr,
2330                     'asr': sampling_rate,
2331                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2332                     'acodec': 'none' if stream_type == 'video' else fourcc,
2333                     'protocol': 'ism',
2334                     'fragments': fragments,
2335                     '_download_params': {
2336                         'duration': duration,
2337                         'timescale': stream_timescale,
2338                         'width': width or 0,
2339                         'height': height or 0,
2340                         'fourcc': fourcc,
2341                         'codec_private_data': track.get('CodecPrivateData'),
2342                         'sampling_rate': sampling_rate,
2343                         'channels': int_or_none(track.get('Channels', 2)),
2344                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2345                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2346                     },
2347                 })
2348         return formats
2349
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage.

        base_url is the URL of the source page; it is used both to resolve
        relative media URLs and as the Referer header of every extracted
        format.  Returns a list of dicts, each with 'formats', 'subtitles'
        and 'thumbnail' keys; entries without any formats or subtitles are
        dropped.
        """
        def absolute_url(item_url):
            # Resolve item_url relative to the source page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a 'type' attribute like 'video/mp4; codecs="avc1"' into
            # partial format info (ext and codec fields)
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) are
            # expanded into their contained formats; anything else is taken
            # as a single direct media URL.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # Self-closing tags first (no inner content, hence empty third field)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # res attribute is not standard but seen several times
                        # in the wild
                        f.update({
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Use the source page URL as Referer since some servers reject
            # requests without it (see
            # https://github.com/rg3/youtube-dl/issues/16849)
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2445
2446     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2447         formats = []
2448         hdcore_sign = 'hdcore=3.7.0'
2449         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2450         hds_host = hosts.get('hds')
2451         if hds_host:
2452             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2453         if 'hdcore=' not in f4m_url:
2454             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2455         f4m_formats = self._extract_f4m_formats(
2456             f4m_url, video_id, f4m_id='hds', fatal=False)
2457         for entry in f4m_formats:
2458             entry.update({'extra_param_to_segment_url': hdcore_sign})
2459         formats.extend(f4m_formats)
2460         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2461         hls_host = hosts.get('hls')
2462         if hls_host:
2463             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2464         formats.extend(self._extract_m3u8_formats(
2465             m3u8_url, video_id, 'mp4', 'm3u8_native',
2466             m3u8_id='hls', fatal=False))
2467         return formats
2468
2469     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2470         query = compat_urlparse.urlparse(url).query
2471         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2472         mobj = re.search(
2473             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2474         url_base = mobj.group('url')
2475         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2476         formats = []
2477
2478         def manifest_url(manifest):
2479             m_url = '%s/%s' % (http_base_url, manifest)
2480             if query:
2481                 m_url += '?%s' % query
2482             return m_url
2483
2484         if 'm3u8' not in skip_protocols:
2485             formats.extend(self._extract_m3u8_formats(
2486                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2487                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2488         if 'f4m' not in skip_protocols:
2489             formats.extend(self._extract_f4m_formats(
2490                 manifest_url('manifest.f4m'),
2491                 video_id, f4m_id='hds', fatal=False))
2492         if 'dash' not in skip_protocols:
2493             formats.extend(self._extract_mpd_formats(
2494                 manifest_url('manifest.mpd'),
2495                 video_id, mpd_id='dash', fatal=False))
2496         if re.search(r'(?:/smil:|\.smil)', url_base):
2497             if 'smil' not in skip_protocols:
2498                 rtmp_formats = self._extract_smil_formats(
2499                     manifest_url('jwplayer.smil'),
2500                     video_id, fatal=False)
2501                 for rtmp_format in rtmp_formats:
2502                     rtsp_format = rtmp_format.copy()
2503                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2504                     del rtsp_format['play_path']
2505                     del rtsp_format['ext']
2506                     rtsp_format.update({
2507                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2508                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2509                         'protocol': 'rtsp',
2510                     })
2511                     formats.extend([rtmp_format, rtsp_format])
2512         else:
2513             for protocol in ('rtmp', 'rtsp'):
2514                 if protocol not in skip_protocols:
2515                     formats.append({
2516                         'url': '%s:%s' % (protocol, url_base),
2517                         'format_id': protocol,
2518                         'protocol': protocol,
2519                     })
2520         return formats
2521
2522     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2523         mobj = re.search(
2524             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2525             webpage)
2526         if mobj:
2527             try:
2528                 jwplayer_data = self._parse_json(mobj.group('options'),
2529                                                  video_id=video_id,
2530                                                  transform_source=transform_source)
2531             except ExtractorError:
2532                 pass
2533             else:
2534                 if isinstance(jwplayer_data, dict):
2535                     return jwplayer_data
2536
2537     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2538         jwplayer_data = self._find_jwplayer_data(
2539             webpage, video_id, transform_source=js_to_json)
2540         return self._parse_jwplayer_data(
2541             jwplayer_data, video_id, *args, **kwargs)
2542
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a decoded JWPlayer setup object into an info dict.

        jwplayer_data is the (already JSON-decoded) object passed to
        jwplayer(...).setup(...).  Handles several legacy JWPlayer layouts
        (flattened playlist, single non-list playlist item, flattened
        sources).  Returns a single info dict when the playlist yields
        exactly one entry, otherwise a playlist result.  When require_title
        is true, a missing 'title' raises KeyError.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            # Without an explicit video_id, fall back to the item's own id.
            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect subtitle/caption tracks, grouped by label ('en' fallback).
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Only caption/subtitle tracks are extracted; other
                    # track kinds are ignored.
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    # Track files may be relative to the page URL.
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube source is delegated via url_transparent instead
            # of being exposed as a direct format.
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2610
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build a formats list from a JWPlayer 'sources' array.

        Each source may point at an HLS/DASH/SMIL manifest (expanded via
        the corresponding extractor helpers), an audio-only file, an RTMP
        URL, or a plain progressive download.  Duplicate source URLs are
        skipped.  base_url, when given, resolves relative source URLs.
        """
        urls = []  # source URLs already handled, to drop duplicates
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            if source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            # Prefer the declared MIME type; fall back to the URL extension.
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # RTMP URLs embed the play path after a mp4:/mp3:/flv:
                    # prefix; split it off into 'play_path'.
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2677
2678     def _live_title(self, name):
2679         """ Generate the title for a live video """
2680         now = datetime.datetime.now()
2681         now_str = now.strftime('%Y-%m-%d %H:%M')
2682         return name + ' ' + now_str
2683
2684     def _int(self, v, name, fatal=False, **kwargs):
2685         res = int_or_none(v, **kwargs)
2686         if 'get_attr' in kwargs:
2687             print(getattr(v, kwargs['get_attr']))
2688         if res is None:
2689             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2690             if fatal:
2691                 raise ExtractorError(msg)
2692             else:
2693                 self._downloader.report_warning(msg)
2694         return res
2695
2696     def _float(self, v, name, fatal=False, **kwargs):
2697         res = float_or_none(v, **kwargs)
2698         if res is None:
2699             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2700             if fatal:
2701                 raise ExtractorError(msg)
2702             else:
2703                 self._downloader.report_warning(msg)
2704         return res
2705
2706     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2707                     path='/', secure=False, discard=False, rest={}, **kwargs):
2708         cookie = compat_cookiejar.Cookie(
2709             0, name, value, port, port is not None, domain, True,
2710             domain.startswith('.'), path, True, secure, expire_time,
2711             discard, None, None, rest)
2712         self._downloader.cookiejar.set_cookie(cookie)
2713
2714     def _get_cookies(self, url):
2715         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2716         req = sanitized_Request(url)
2717         self._downloader.cookiejar.add_cookie_header(req)
2718         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2719
2720     def get_testcases(self, include_onlymatching=False):
2721         t = getattr(self, '_TEST', None)
2722         if t:
2723             assert not hasattr(self, '_TESTS'), \
2724                 '%s has _TEST and _TESTS' % type(self).__name__
2725             tests = [t]
2726         else:
2727             tests = getattr(self, '_TESTS', [])
2728         for t in tests:
2729             if not include_onlymatching and t.get('only_matching', False):
2730                 continue
2731             t['name'] = type(self).__name__[:-len('IE')]
2732             yield t
2733
2734     def is_suitable(self, age_limit):
2735         """ Test whether the extractor is generally suitable for the given
2736         age limit (i.e. pornographic sites are not, all others usually are) """
2737
2738         any_restricted = False
2739         for tc in self.get_testcases(include_onlymatching=False):
2740             if tc.get('playlist', []):
2741                 tc = tc['playlist'][0]
2742             is_restricted = age_restricted(
2743                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2744             if not is_restricted:
2745                 return True
2746             any_restricted = any_restricted or is_restricted
2747         return not any_restricted
2748
2749     def extract_subtitles(self, *args, **kwargs):
2750         if (self._downloader.params.get('writesubtitles', False) or
2751                 self._downloader.params.get('listsubtitles')):
2752             return self._get_subtitles(*args, **kwargs)
2753         return {}
2754
2755     def _get_subtitles(self, *args, **kwargs):
2756         raise NotImplementedError('This method must be implemented by subclasses')
2757
2758     @staticmethod
2759     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2760         """ Merge subtitle items for one language. Items with duplicated URLs
2761         will be dropped. """
2762         list1_urls = set([item['url'] for item in subtitle_list1])
2763         ret = list(subtitle_list1)
2764         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2765         return ret
2766
2767     @classmethod
2768     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2769         """ Merge two subtitle dictionaries, language by language. """
2770         ret = dict(subtitle_dict1)
2771         for lang in subtitle_dict2:
2772             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2773         return ret
2774
2775     def extract_automatic_captions(self, *args, **kwargs):
2776         if (self._downloader.params.get('writeautomaticsub', False) or
2777                 self._downloader.params.get('listsubtitles')):
2778             return self._get_automatic_captions(*args, **kwargs)
2779         return {}
2780
2781     def _get_automatic_captions(self, *args, **kwargs):
2782         raise NotImplementedError('This method must be implemented by subclasses')
2783
2784     def mark_watched(self, *args, **kwargs):
2785         if (self._downloader.params.get('mark_watched', False) and
2786                 (self._get_login_info()[0] is not None or
2787                     self._downloader.params.get('cookiefile') is not None)):
2788             self._mark_watched(*args, **kwargs)
2789
2790     def _mark_watched(self, *args, **kwargs):
2791         raise NotImplementedError('This method must be implemented by subclasses')
2792
2793     def geo_verification_headers(self):
2794         headers = {}
2795         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2796         if geo_verification_proxy:
2797             headers['Ytdl-request-proxy'] = geo_verification_proxy
2798         return headers
2799
2800     def _generic_id(self, url):
2801         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2802
2803     def _generic_title(self, url):
2804         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2805
2806
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Empty prefix means a single result, a positive integer requests
        # that many results and 'all' requests _MAX_RESULTS.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        match = re.match(cls._make_valid_url(), url)
        return match is not None

    def _real_extract(self, query):
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        query = match.group('query')
        if prefix == '':
            n = 1
        elif prefix == 'all':
            n = self._MAX_RESULTS
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            if n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY