]> git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py
[bitchute] Fix extraction by pass custom User-Agent
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_integer_types,
23     compat_http_client,
24     compat_os_name,
25     compat_str,
26     compat_urllib_error,
27     compat_urllib_parse_unquote,
28     compat_urllib_parse_urlencode,
29     compat_urllib_request,
30     compat_urlparse,
31     compat_xml_parse_error,
32 )
33 from ..downloader.f4m import (
34     get_base_url,
35     remove_encrypted_media,
36 )
37 from ..utils import (
38     NO_DEFAULT,
39     age_restricted,
40     base_url,
41     bug_reports_message,
42     clean_html,
43     compiled_regex_type,
44     determine_ext,
45     determine_protocol,
46     error_to_compat_str,
47     ExtractorError,
48     extract_attributes,
49     fix_xml_ampersands,
50     float_or_none,
51     GeoRestrictedError,
52     GeoUtils,
53     int_or_none,
54     js_to_json,
55     JSON_LD_RE,
56     mimetype2ext,
57     orderedSet,
58     parse_codecs,
59     parse_duration,
60     parse_iso8601,
61     parse_m3u8_attributes,
62     RegexNotFoundError,
63     sanitized_Request,
64     sanitize_filename,
65     unescapeHTML,
66     unified_strdate,
67     unified_timestamp,
68     update_Request,
69     update_url_query,
70     urljoin,
71     url_basename,
72     xpath_element,
73     xpath_text,
74     xpath_with_ns,
75 )
76
77
78 class InfoExtractor(object):
79     """Information Extractor class.
80
81     Information extractors are the classes that, given a URL, extract
82     information about the video (or videos) the URL refers to. This
83     information includes the real video URL, the video title, author and
84     others. The information is stored in a dictionary which is then
85     passed to the YoutubeDL. The YoutubeDL processes this
86     information possibly downloading the video to the file system, among
87     other possible outcomes.
88
89     The type field determines the type of the result.
90     By far the most common value (and the default if _type is missing) is
91     "video", which indicates a single video.
92
93     For a video, the dictionaries must include the following fields:
94
95     id:             Video identifier.
96     title:          Video title, unescaped.
97
98     Additionally, it must contain either a formats entry or a url one:
99
100     formats:        A list of dictionaries for each format available, ordered
101                     from worst to best quality.
102
103                     Potential fields:
104                     * url        Mandatory. The URL of the video file
105                     * manifest_url
106                                  The URL of the manifest file in case of
107                                  fragmented media (DASH, hls, hds)
108                     * ext        Will be calculated from URL if missing
109                     * format     A human-readable description of the format
110                                  ("mp4 container with h264/opus").
111                                  Calculated from the format_id, width, height,
112                                  and format_note fields if missing.
113                     * format_id  A short description of the format
114                                  ("mp4_h264_opus" or "19").
115                                 Technically optional, but strongly recommended.
116                     * format_note Additional info about the format
117                                  ("3D" or "DASH video")
118                     * width      Width of the video, if known
119                     * height     Height of the video, if known
120                     * resolution Textual description of width and height
121                     * tbr        Average bitrate of audio and video in KBit/s
122                     * abr        Average audio bitrate in KBit/s
123                     * acodec     Name of the audio codec in use
124                     * asr        Audio sampling rate in Hertz
125                     * vbr        Average video bitrate in KBit/s
126                     * fps        Frame rate
127                     * vcodec     Name of the video codec in use
128                     * container  Name of the container format
129                     * filesize   The number of bytes, if known in advance
130                     * filesize_approx  An estimate for the number of bytes
131                     * player_url SWF Player URL (used for rtmpdump).
132                     * protocol   The protocol that will be used for the actual
133                                  download, lower-case.
134                                  "http", "https", "rtsp", "rtmp", "rtmpe",
135                                  "m3u8", "m3u8_native" or "http_dash_segments".
136                     * fragment_base_url
137                                  Base URL for fragments. Each fragment's path
138                                  value (if present) will be relative to
139                                  this URL.
140                     * fragments  A list of fragments of a fragmented media.
141                                  Each fragment entry must contain either an url
142                                  or a path. If an url is present it should be
143                                  considered by a client. Otherwise both path and
144                                  fragment_base_url must be present. Here is
145                                  the list of all potential fields:
146                                  * "url" - fragment's URL
147                                  * "path" - fragment's path relative to
148                                             fragment_base_url
149                                  * "duration" (optional, int or float)
150                                  * "filesize" (optional, int)
151                     * preference Order number of this format. If this field is
152                                  present and not None, the formats get sorted
153                                  by this field, regardless of all other values.
154                                  -1 for default (order by other properties),
155                                  -2 or smaller for less than default.
156                                  < -1000 to hide the format (if there is
157                                     another one which is strictly better)
158                     * language   Language code, e.g. "de" or "en-US".
159                     * language_preference  Is this in the language mentioned in
160                                  the URL?
161                                  10 if it's what the URL is about,
162                                  -1 for default (don't know),
163                                  -10 otherwise, other values reserved for now.
164                     * quality    Order number of the video quality of this
165                                  format, irrespective of the file format.
166                                  -1 for default (order by other properties),
167                                  -2 or smaller for less than default.
168                     * source_preference  Order number for this video source
169                                   (quality takes higher priority)
170                                  -1 for default (order by other properties),
171                                  -2 or smaller for less than default.
172                     * http_headers  A dictionary of additional HTTP headers
173                                  to add to the request.
174                     * stretched_ratio  If given and not 1, indicates that the
175                                  video's pixels are not square.
176                                  width : height ratio as float.
177                     * no_resume  The server does not support resuming the
178                                  (HTTP or RTMP) download. Boolean.
179                     * downloader_options  A dictionary of downloader options as
180                                  described in FileDownloader
181
182     url:            Final video URL.
183     ext:            Video filename extension.
184     format:         The video format, defaults to ext (used for --get-format)
185     player_url:     SWF Player URL (used for rtmpdump).
186
187     The following fields are optional:
188
189     alt_title:      A secondary title of the video.
190     display_id      An alternative identifier for the video, not necessarily
191                     unique, but available before title. Typically, id is
192                     something like "4234987", title "Dancing naked mole rats",
193                     and display_id "dancing-naked-mole-rats"
194     thumbnails:     A list of dictionaries, with the following entries:
195                         * "id" (optional, string) - Thumbnail format ID
196                         * "url"
197                         * "preference" (optional, int) - quality of the image
198                         * "width" (optional, int)
199                         * "height" (optional, int)
200                         * "resolution" (optional, string "{width}x{height}",
201                                         deprecated)
202                         * "filesize" (optional, int)
203     thumbnail:      Full URL to a video thumbnail image.
204     description:    Full video description.
205     uploader:       Full name of the video uploader.
206     license:        License name the video is licensed under.
207     creator:        The creator of the video.
208     release_date:   The date (YYYYMMDD) when the video was released.
209     timestamp:      UNIX timestamp of the moment the video became available.
210     upload_date:    Video upload date (YYYYMMDD).
211                     If not explicitly set, calculated from timestamp.
212     uploader_id:    Nickname or id of the video uploader.
213     uploader_url:   Full URL to a personal webpage of the video uploader.
214     location:       Physical location where the video was filmed.
215     subtitles:      The available subtitles as a dictionary in the format
216                     {tag: subformats}. "tag" is usually a language code, and
217                     "subformats" is a list sorted from lower to higher
218                     preference, each element is a dictionary with the "ext"
219                     entry and one of:
220                         * "data": The subtitles file contents
221                         * "url": A URL pointing to the subtitles file
222                     "ext" will be calculated from URL if missing
223     automatic_captions: Like 'subtitles', used by the YoutubeIE for
224                     automatically generated captions
225     duration:       Length of the video in seconds, as an integer or float.
226     view_count:     How many users have watched the video on the platform.
227     like_count:     Number of positive ratings of the video
228     dislike_count:  Number of negative ratings of the video
229     repost_count:   Number of reposts of the video
230     average_rating: Average rating given by users, the scale used depends on the webpage
231     comment_count:  Number of comments on the video
232     comments:       A list of comments, each with one or more of the following
233                     properties (all but one of text or html optional):
234                         * "author" - human-readable name of the comment author
235                         * "author_id" - user ID of the comment author
236                         * "id" - Comment ID
237                         * "html" - Comment as HTML
238                         * "text" - Plain text of the comment
239                         * "timestamp" - UNIX timestamp of comment
240                         * "parent" - ID of the comment this one is replying to.
241                                      Set to "root" to indicate that this is a
242                                      comment to the original video.
243     age_limit:      Age restriction for the video, as an integer (years)
244     webpage_url:    The URL to the video webpage, if given to youtube-dl it
245                     should allow to get the same result again. (It will be set
246                     by YoutubeDL if it's missing)
247     categories:     A list of categories that the video falls in, for example
248                     ["Sports", "Berlin"]
249     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
250     is_live:        True, False, or None (=unknown). Whether this video is a
251                     live stream that goes on instead of a fixed-length video.
252     start_time:     Time in seconds where the reproduction should start, as
253                     specified in the URL.
254     end_time:       Time in seconds where the reproduction should end, as
255                     specified in the URL.
256     chapters:       A list of dictionaries, with the following entries:
257                         * "start_time" - The start time of the chapter in seconds
258                         * "end_time" - The end time of the chapter in seconds
259                         * "title" (optional, string)
260
261     The following fields should only be used when the video belongs to some logical
262     chapter or section:
263
264     chapter:        Name or title of the chapter the video belongs to.
265     chapter_number: Number of the chapter the video belongs to, as an integer.
266     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
267
268     The following fields should only be used when the video is an episode of some
269     series, programme or podcast:
270
271     series:         Title of the series or programme the video episode belongs to.
272     season:         Title of the season the video episode belongs to.
273     season_number:  Number of the season the video episode belongs to, as an integer.
274     season_id:      Id of the season the video episode belongs to, as a unicode string.
275     episode:        Title of the video episode. Unlike mandatory video title field,
276                     this field should denote the exact title of the video episode
277                     without any kind of decoration.
278     episode_number: Number of the video episode within a season, as an integer.
279     episode_id:     Id of the video episode, as a unicode string.
280
281     The following fields should only be used when the media is a track or a part of
282     a music album:
283
284     track:          Title of the track.
285     track_number:   Number of the track within an album or a disc, as an integer.
286     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
287                     as a unicode string.
288     artist:         Artist(s) of the track.
289     genre:          Genre(s) of the track.
290     album:          Title of the album the track belongs to.
291     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
292     album_artist:   List of all artists appeared on the album (e.g.
293                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
294                     and compilations).
295     disc_number:    Number of the disc or other physical medium the track belongs to,
296                     as an integer.
297     release_year:   Year (YYYY) when the album was released.
298
299     Unless mentioned otherwise, the fields should be Unicode strings.
300
301     Unless mentioned otherwise, None is equivalent to absence of information.
302
303
304     _type "playlist" indicates multiple videos.
305     There must be a key "entries", which is a list, an iterable, or a PagedList
306     object, each element of which is a valid dictionary by this specification.
307
308     Additionally, playlists can have "id", "title", "description", "uploader",
309     "uploader_id", "uploader_url" attributes with the same semantics as videos
310     (see above).
311
312
313     _type "multi_video" indicates that there are multiple videos that
314     form a single show, for example multiple acts of an opera or TV episode.
315     It must have an entries key like a playlist and contain all the keys
316     required for a video at the same time.
317
318
319     _type "url" indicates that the video must be extracted from another
320     location, possibly by a different extractor. Its only required key is:
321     "url" - the next URL to extract.
322     The key "ie_key" can be set to the class name (minus the trailing "IE",
323     e.g. "Youtube") if the extractor class is known in advance.
324     Additionally, the dictionary may have any properties of the resolved entity
325     known in advance, for example "title" if the title of the referred video is
326     known ahead of time.
327
328
329     _type "url_transparent" entities have the same specification as "url", but
330     indicate that the given additional information is more precise than the one
331     associated with the resolved URL.
332     This is useful when a site employs a video service that hosts the video and
333     its technical metadata, but that video service does not embed a useful
334     title, description etc.
335
336
337     Subclasses of this one should re-define the _real_initialize() and
338     _real_extract() methods and define a _VALID_URL regexp.
339     Probably, they should also be added to the list of extractors.
340
341     _GEO_BYPASS attribute may be set to False in order to disable
342     geo restriction bypass mechanisms for a particular extractor.
343     Though it won't disable explicit geo restriction bypass based on
344     country code provided with geo_bypass_country.
345
346     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
347     countries for this extractor. One of these countries will be used by
348     geo restriction bypass mechanism right away in order to bypass
349     geo restriction, of course, if the mechanism is not disabled.
350
351     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
352     IP blocks in CIDR notation for this extractor. One of these IP blocks
353     will be used by geo restriction bypass mechanism similarly
354     to _GEO_COUNTRIES.
355
356     Finally, the _WORKING attribute should be set to False for broken IEs
357     in order to warn the users and skip the tests.
358     """
359
    # True once _real_initialize() has been run for the instance
    # (see initialize())
    _ready = False
    # Downloader object, assigned through set_downloader()
    _downloader = None
    # Fake IP passed as X-Forwarded-For HTTP header for geo restriction
    # bypass; None while no fake IP has been chosen
    _x_forwarded_for_ip = None
    # Geo restriction bypass settings, described in the class docstring
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    # Should be set to False for broken IEs
    _WORKING = True
367
368     def __init__(self, downloader=None):
369         """Constructor. Receives an optional downloader."""
370         self._ready = False
371         self._x_forwarded_for_ip = None
372         self.set_downloader(downloader)
373
374     @classmethod
375     def suitable(cls, url):
376         """Receives a URL and returns True if suitable for this IE."""
377
378         # This does not use has/getattr intentionally - we want to know whether
379         # we have cached the regexp for *this* class, whereas getattr would also
380         # match the superclass
381         if '_VALID_URL_RE' not in cls.__dict__:
382             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
383         return cls._VALID_URL_RE.match(url) is not None
384
385     @classmethod
386     def _match_id(cls, url):
387         if '_VALID_URL_RE' not in cls.__dict__:
388             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
389         m = cls._VALID_URL_RE.match(url)
390         assert m
391         return compat_str(m.group('id'))
392
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class' _WORKING attribute, which is True by default and
        should be set to False for broken IEs.
        """
        return cls._WORKING
397
398     def initialize(self):
399         """Initializes an instance (authentication, etc)."""
400         self._initialize_geo_bypass({
401             'countries': self._GEO_COUNTRIES,
402             'ip_blocks': self._GEO_IP_BLOCKS,
403         })
404         if not self._ready:
405             self._real_initialize()
406             self._ready = True
407
408     def _initialize_geo_bypass(self, geo_bypass_context):
409         """
410         Initialize geo restriction bypass mechanism.
411
412         This method is used to initialize geo bypass mechanism based on faking
413         X-Forwarded-For HTTP header. A random country from provided country list
414         is selected and a random IP belonging to this country is generated. This
415         IP will be passed as X-Forwarded-For HTTP header in all subsequent
416         HTTP requests.
417
418         This method will be used for initial geo bypass mechanism initialization
419         during the instance initialization with _GEO_COUNTRIES and
420         _GEO_IP_BLOCKS.
421
422         You may also manually call it from extractor's code if geo bypass
423         information is not available beforehand (e.g. obtained during
424         extraction) or due to some other reason. In this case you should pass
425         this information in geo bypass context passed as first argument. It may
426         contain following fields:
427
428         countries:  List of geo unrestricted countries (similar
429                     to _GEO_COUNTRIES)
430         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
431                     (similar to _GEO_IP_BLOCKS)
432
433         """
434         if not self._x_forwarded_for_ip:
435
436             # Geo bypass mechanism is explicitly disabled by user
437             if not self._downloader.params.get('geo_bypass', True):
438                 return
439
440             if not geo_bypass_context:
441                 geo_bypass_context = {}
442
443             # Backward compatibility: previously _initialize_geo_bypass
444             # expected a list of countries, some 3rd party code may still use
445             # it this way
446             if isinstance(geo_bypass_context, (list, tuple)):
447                 geo_bypass_context = {
448                     'countries': geo_bypass_context,
449                 }
450
451             # The whole point of geo bypass mechanism is to fake IP
452             # as X-Forwarded-For HTTP header based on some IP block or
453             # country code.
454
455             # Path 1: bypassing based on IP block in CIDR notation
456
457             # Explicit IP block specified by user, use it right away
458             # regardless of whether extractor is geo bypassable or not
459             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
460
461             # Otherwise use random IP block from geo bypass context but only
462             # if extractor is known as geo bypassable
463             if not ip_block:
464                 ip_blocks = geo_bypass_context.get('ip_blocks')
465                 if self._GEO_BYPASS and ip_blocks:
466                     ip_block = random.choice(ip_blocks)
467
468             if ip_block:
469                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
470                 if self._downloader.params.get('verbose', False):
471                     self._downloader.to_screen(
472                         '[debug] Using fake IP %s as X-Forwarded-For.'
473                         % self._x_forwarded_for_ip)
474                 return
475
476             # Path 2: bypassing based on country code
477
478             # Explicit country code specified by user, use it right away
479             # regardless of whether extractor is geo bypassable or not
480             country = self._downloader.params.get('geo_bypass_country', None)
481
482             # Otherwise use random country code from geo bypass context but
483             # only if extractor is known as geo bypassable
484             if not country:
485                 countries = geo_bypass_context.get('countries')
486                 if self._GEO_BYPASS and countries:
487                     country = random.choice(countries)
488
489             if country:
490                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
491                 if self._downloader.params.get('verbose', False):
492                     self._downloader.to_screen(
493                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
494                         % (self._x_forwarded_for_ip, country.upper()))
495
496     def extract(self, url):
497         """Extracts URL information and returns it in list of dicts."""
498         try:
499             for _ in range(2):
500                 try:
501                     self.initialize()
502                     ie_result = self._real_extract(url)
503                     if self._x_forwarded_for_ip:
504                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
505                     return ie_result
506                 except GeoRestrictedError as e:
507                     if self.__maybe_fake_ip_and_retry(e.countries):
508                         continue
509                     raise
510         except ExtractorError:
511             raise
512         except compat_http_client.IncompleteRead as e:
513             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
514         except (KeyError, StopIteration) as e:
515             raise ExtractorError('An extractor error has occurred.', cause=e)
516
517     def __maybe_fake_ip_and_retry(self, countries):
518         if (not self._downloader.params.get('geo_bypass_country', None) and
519                 self._GEO_BYPASS and
520                 self._downloader.params.get('geo_bypass', True) and
521                 not self._x_forwarded_for_ip and
522                 countries):
523             country_code = random.choice(countries)
524             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
525             if self._x_forwarded_for_ip:
526                 self.report_warning(
527                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
528                     % (self._x_forwarded_for_ip, country_code.upper()))
529                 return True
530         return False
531
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The object is stored as self._downloader. It may be None.
        """
        self._downloader = downloader
535
536     def _real_initialize(self):
537         """Real initialization process. Redefine in subclasses."""
538         pass
539
540     def _real_extract(self, url):
541         """Real extraction process. Redefine in subclasses."""
542         pass
543
544     @classmethod
545     def ie_key(cls):
546         """A string for getting the InfoExtractor with get_info_extractor"""
547         return compat_str(cls.__name__[:-2])
548
549     @property
550     def IE_NAME(self):
551         return compat_str(type(self).__name__[:-2])
552
553     @staticmethod
554     def __can_accept_status_code(err, expected_status):
555         assert isinstance(err, compat_urllib_error.HTTPError)
556         if expected_status is None:
557             return False
558         if isinstance(expected_status, compat_integer_types):
559             return err.code == expected_status
560         elif isinstance(expected_status, (list, tuple)):
561             return err.code in expected_status
562         elif callable(expected_status):
563             return expected_status(err.code) is True
564         else:
565             assert False
566
567     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
568         """
569         Return the response handle.
570
571         See _download_webpage docstring for arguments specification.
572         """
573         if note is None:
574             self.report_download_webpage(video_id)
575         elif note is not False:
576             if video_id is None:
577                 self.to_screen('%s' % (note,))
578             else:
579                 self.to_screen('%s: %s' % (video_id, note))
580
581         # Some sites check X-Forwarded-For HTTP header in order to figure out
582         # the origin of the client behind proxy. This allows bypassing geo
583         # restriction by faking this header's value to IP that belongs to some
584         # geo unrestricted country. We will do so once we encounter any
585         # geo restriction error.
586         if self._x_forwarded_for_ip:
587             if 'X-Forwarded-For' not in headers:
588                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
589
590         if isinstance(url_or_request, compat_urllib_request.Request):
591             url_or_request = update_Request(
592                 url_or_request, data=data, headers=headers, query=query)
593         else:
594             if query:
595                 url_or_request = update_url_query(url_or_request, query)
596             if data is not None or headers:
597                 url_or_request = sanitized_Request(url_or_request, data, headers)
598         try:
599             return self._downloader.urlopen(url_or_request)
600         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
601             if isinstance(err, compat_urllib_error.HTTPError):
602                 if self.__can_accept_status_code(err, expected_status):
603                     return err.fp
604
605             if errnote is False:
606                 return False
607             if errnote is None:
608                 errnote = 'Unable to download webpage'
609
610             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
611             if fatal:
612                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
613             else:
614                 self._downloader.report_warning(errmsg)
615                 return False
616
617     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
618         """
619         Return a tuple (page content as string, URL handle).
620
621         See _download_webpage docstring for arguments specification.
622         """
623         # Strip hashes from the URL (#1038)
624         if isinstance(url_or_request, (compat_str, str)):
625             url_or_request = url_or_request.partition('#')[0]
626
627         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
628         if urlh is False:
629             assert not fatal
630             return False
631         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
632         return (content, urlh)
633
634     @staticmethod
635     def _guess_encoding_from_content(content_type, webpage_bytes):
636         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
637         if m:
638             encoding = m.group(1)
639         else:
640             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
641                           webpage_bytes[:1024])
642             if m:
643                 encoding = m.group(1).decode('ascii')
644             elif webpage_bytes.startswith(b'\xff\xfe'):
645                 encoding = 'utf-16'
646             else:
647                 encoding = 'utf-8'
648
649         return encoding
650
651     def __check_blocked(self, content):
652         first_block = content[:512]
653         if ('<title>Access to this site is blocked</title>' in content and
654                 'Websense' in first_block):
655             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
656             blocked_iframe = self._html_search_regex(
657                 r'<iframe src="([^"]+)"', content,
658                 'Websense information URL', default=None)
659             if blocked_iframe:
660                 msg += ' Visit %s for more details' % blocked_iframe
661             raise ExtractorError(msg, expected=True)
662         if '<title>The URL you requested has been blocked</title>' in first_block:
663             msg = (
664                 'Access to this webpage has been blocked by Indian censorship. '
665                 'Use a VPN or proxy server (with --proxy) to route around it.')
666             block_msg = self._html_search_regex(
667                 r'</h1><p>(.*?)</p>',
668                 content, 'block message', default=None)
669             if block_msg:
670                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
671             raise ExtractorError(msg, expected=True)
672         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
673                 'blocklist.rkn.gov.ru' in content):
674             raise ExtractorError(
675                 'Access to this webpage has been blocked by decision of the Russian government. '
676                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
677                 expected=True)
678
679     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
680         content_type = urlh.headers.get('Content-Type', '')
681         webpage_bytes = urlh.read()
682         if prefix is not None:
683             webpage_bytes = prefix + webpage_bytes
684         if not encoding:
685             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
686         if self._downloader.params.get('dump_intermediate_pages', False):
687             self.to_screen('Dumping request to ' + urlh.geturl())
688             dump = base64.b64encode(webpage_bytes).decode('ascii')
689             self._downloader.to_screen(dump)
690         if self._downloader.params.get('write_pages', False):
691             basen = '%s_%s' % (video_id, urlh.geturl())
692             if len(basen) > 240:
693                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
694                 basen = basen[:240 - len(h)] + h
695             raw_filename = basen + '.dump'
696             filename = sanitize_filename(raw_filename, restricted=True)
697             self.to_screen('Saving request to ' + filename)
698             # Working around MAX_PATH limitation on Windows (see
699             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
700             if compat_os_name == 'nt':
701                 absfilepath = os.path.abspath(filename)
702                 if len(absfilepath) > 259:
703                     filename = '\\\\?\\' + absfilepath
704             with open(filename, 'wb') as outf:
705                 outf.write(webpage_bytes)
706
707         try:
708             content = webpage_bytes.decode(encoding, 'replace')
709         except LookupError:
710             content = webpage_bytes.decode('utf-8', 'replace')
711
712         self.__check_blocked(content)
713
714         return content
715
716     def _download_webpage(
717             self, url_or_request, video_id, note=None, errnote=None,
718             fatal=True, tries=1, timeout=5, encoding=None, data=None,
719             headers={}, query={}, expected_status=None):
720         """
721         Return the data of the page as a string.
722
723         Arguments:
724         url_or_request -- plain text URL as a string or
725             a compat_urllib_request.Requestobject
726         video_id -- Video/playlist/item identifier (string)
727
728         Keyword arguments:
729         note -- note printed before downloading (string)
730         errnote -- note printed in case of an error (string)
731         fatal -- flag denoting whether error should be considered fatal,
732             i.e. whether it should cause ExtractionError to be raised,
733             otherwise a warning will be reported and extraction continued
734         tries -- number of tries
735         timeout -- sleep interval between tries
736         encoding -- encoding for a page content decoding, guessed automatically
737             when not explicitly specified
738         data -- POST data (bytes)
739         headers -- HTTP headers (dict)
740         query -- URL query (dict)
741         expected_status -- allows to accept failed HTTP requests (non 2xx
742             status code) by explicitly specifying a set of accepted status
743             codes. Can be any of the following entities:
744                 - an integer type specifying an exact failed status code to
745                   accept
746                 - a list or a tuple of integer types specifying a list of
747                   failed status codes to accept
748                 - a callable accepting an actual failed status code and
749                   returning True if it should be accepted
750             Note that this argument does not affect success status codes (2xx)
751             which are always accepted.
752         """
753
754         success = False
755         try_count = 0
756         while success is False:
757             try:
758                 res = self._download_webpage_handle(
759                     url_or_request, video_id, note, errnote, fatal,
760                     encoding=encoding, data=data, headers=headers, query=query,
761                     expected_status=expected_status)
762                 success = True
763             except compat_http_client.IncompleteRead as e:
764                 try_count += 1
765                 if try_count >= tries:
766                     raise e
767                 self._sleep(timeout, video_id)
768         if res is False:
769             return res
770         else:
771             content, _ = res
772             return content
773
774     def _download_xml_handle(
775             self, url_or_request, video_id, note='Downloading XML',
776             errnote='Unable to download XML', transform_source=None,
777             fatal=True, encoding=None, data=None, headers={}, query={},
778             expected_status=None):
779         """
780         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
781
782         See _download_webpage docstring for arguments specification.
783         """
784         res = self._download_webpage_handle(
785             url_or_request, video_id, note, errnote, fatal=fatal,
786             encoding=encoding, data=data, headers=headers, query=query,
787             expected_status=expected_status)
788         if res is False:
789             return res
790         xml_string, urlh = res
791         return self._parse_xml(
792             xml_string, video_id, transform_source=transform_source,
793             fatal=fatal), urlh
794
795     def _download_xml(
796             self, url_or_request, video_id,
797             note='Downloading XML', errnote='Unable to download XML',
798             transform_source=None, fatal=True, encoding=None,
799             data=None, headers={}, query={}, expected_status=None):
800         """
801         Return the xml as an xml.etree.ElementTree.Element.
802
803         See _download_webpage docstring for arguments specification.
804         """
805         res = self._download_xml_handle(
806             url_or_request, video_id, note=note, errnote=errnote,
807             transform_source=transform_source, fatal=fatal, encoding=encoding,
808             data=data, headers=headers, query=query,
809             expected_status=expected_status)
810         return res if res is False else res[0]
811
812     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
813         if transform_source:
814             xml_string = transform_source(xml_string)
815         try:
816             return compat_etree_fromstring(xml_string.encode('utf-8'))
817         except compat_xml_parse_error as ve:
818             errmsg = '%s: Failed to parse XML ' % video_id
819             if fatal:
820                 raise ExtractorError(errmsg, cause=ve)
821             else:
822                 self.report_warning(errmsg + str(ve))
823
824     def _download_json_handle(
825             self, url_or_request, video_id, note='Downloading JSON metadata',
826             errnote='Unable to download JSON metadata', transform_source=None,
827             fatal=True, encoding=None, data=None, headers={}, query={},
828             expected_status=None):
829         """
830         Return a tuple (JSON object, URL handle).
831
832         See _download_webpage docstring for arguments specification.
833         """
834         res = self._download_webpage_handle(
835             url_or_request, video_id, note, errnote, fatal=fatal,
836             encoding=encoding, data=data, headers=headers, query=query,
837             expected_status=expected_status)
838         if res is False:
839             return res
840         json_string, urlh = res
841         return self._parse_json(
842             json_string, video_id, transform_source=transform_source,
843             fatal=fatal), urlh
844
845     def _download_json(
846             self, url_or_request, video_id, note='Downloading JSON metadata',
847             errnote='Unable to download JSON metadata', transform_source=None,
848             fatal=True, encoding=None, data=None, headers={}, query={},
849             expected_status=None):
850         """
851         Return the JSON object as a dict.
852
853         See _download_webpage docstring for arguments specification.
854         """
855         res = self._download_json_handle(
856             url_or_request, video_id, note=note, errnote=errnote,
857             transform_source=transform_source, fatal=fatal, encoding=encoding,
858             data=data, headers=headers, query=query,
859             expected_status=expected_status)
860         return res if res is False else res[0]
861
862     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
863         if transform_source:
864             json_string = transform_source(json_string)
865         try:
866             return json.loads(json_string)
867         except ValueError as ve:
868             errmsg = '%s: Failed to parse JSON ' % video_id
869             if fatal:
870                 raise ExtractorError(errmsg, cause=ve)
871             else:
872                 self.report_warning(errmsg + str(ve))
873
874     def report_warning(self, msg, video_id=None):
875         idstr = '' if video_id is None else '%s: ' % video_id
876         self._downloader.report_warning(
877             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
878
879     def to_screen(self, msg):
880         """Print msg to screen, prefixing it with '[ie_name]'"""
881         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
882
883     def report_extraction(self, id_or_name):
884         """Report information extraction."""
885         self.to_screen('%s: Extracting information' % id_or_name)
886
887     def report_download_webpage(self, video_id):
888         """Report webpage download."""
889         self.to_screen('%s: Downloading webpage' % video_id)
890
891     def report_age_confirmation(self):
892         """Report attempt to confirm age."""
893         self.to_screen('Confirming age')
894
895     def report_login(self):
896         """Report attempt to log in."""
897         self.to_screen('Logging in')
898
899     @staticmethod
900     def raise_login_required(msg='This video is only available for registered users'):
901         raise ExtractorError(
902             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
903             expected=True)
904
905     @staticmethod
906     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
907         raise GeoRestrictedError(msg, countries=countries)
908
909     # Methods for following #608
910     @staticmethod
911     def url_result(url, ie=None, video_id=None, video_title=None):
912         """Returns a URL that points to a page that should be processed"""
913         # TODO: ie should be the class used for getting the info
914         video_info = {'_type': 'url',
915                       'url': url,
916                       'ie_key': ie}
917         if video_id is not None:
918             video_info['id'] = video_id
919         if video_title is not None:
920             video_info['title'] = video_title
921         return video_info
922
923     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
924         urls = orderedSet(
925             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
926             for m in matches)
927         return self.playlist_result(
928             urls, playlist_id=playlist_id, playlist_title=playlist_title)
929
930     @staticmethod
931     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
932         """Returns a playlist"""
933         video_info = {'_type': 'playlist',
934                       'entries': entries}
935         if playlist_id:
936             video_info['id'] = playlist_id
937         if playlist_title:
938             video_info['title'] = playlist_title
939         if playlist_description:
940             video_info['description'] = playlist_description
941         return video_info
942
943     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
944         """
945         Perform a regex search on the given string, using a single or a list of
946         patterns returning the first matching group.
947         In case of failure return a default value or raise a WARNING or a
948         RegexNotFoundError, depending on fatal, specifying the field name.
949         """
950         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
951             mobj = re.search(pattern, string, flags)
952         else:
953             for p in pattern:
954                 mobj = re.search(p, string, flags)
955                 if mobj:
956                     break
957
958         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
959             _name = '\033[0;34m%s\033[0m' % name
960         else:
961             _name = name
962
963         if mobj:
964             if group is None:
965                 # return the first matching group
966                 return next(g for g in mobj.groups() if g is not None)
967             else:
968                 return mobj.group(group)
969         elif default is not NO_DEFAULT:
970             return default
971         elif fatal:
972             raise RegexNotFoundError('Unable to extract %s' % _name)
973         else:
974             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
975             return None
976
977     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
978         """
979         Like _search_regex, but strips HTML tags and unescapes entities.
980         """
981         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
982         if res:
983             return clean_html(res).strip()
984         else:
985             return res
986
987     def _get_netrc_login_info(self, netrc_machine=None):
988         username = None
989         password = None
990         netrc_machine = netrc_machine or self._NETRC_MACHINE
991
992         if self._downloader.params.get('usenetrc', False):
993             try:
994                 info = netrc.netrc().authenticators(netrc_machine)
995                 if info is not None:
996                     username = info[0]
997                     password = info[2]
998                 else:
999                     raise netrc.NetrcParseError(
1000                         'No authenticators for %s' % netrc_machine)
1001             except (IOError, netrc.NetrcParseError) as err:
1002                 self._downloader.report_warning(
1003                     'parsing .netrc: %s' % error_to_compat_str(err))
1004
1005         return username, password
1006
1007     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1008         """
1009         Get the login info as (username, password)
1010         First look for the manually specified credentials using username_option
1011         and password_option as keys in params dictionary. If no such credentials
1012         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1013         value.
1014         If there's no info available, return (None, None)
1015         """
1016         if self._downloader is None:
1017             return (None, None)
1018
1019         downloader_params = self._downloader.params
1020
1021         # Attempt to use provided username and password or .netrc data
1022         if downloader_params.get(username_option) is not None:
1023             username = downloader_params[username_option]
1024             password = downloader_params[password_option]
1025         else:
1026             username, password = self._get_netrc_login_info(netrc_machine)
1027
1028         return username, password
1029
1030     def _get_tfa_info(self, note='two-factor verification code'):
1031         """
1032         Get the two-factor authentication info
1033         TODO - asking the user will be required for sms/phone verify
1034         currently just uses the command line option
1035         If there's no info available, return None
1036         """
1037         if self._downloader is None:
1038             return None
1039         downloader_params = self._downloader.params
1040
1041         if downloader_params.get('twofactor') is not None:
1042             return downloader_params['twofactor']
1043
1044         return compat_getpass('Type %s and press [Return]: ' % note)
1045
1046     # Helper functions for extracting OpenGraph info
1047     @staticmethod
1048     def _og_regexes(prop):
1049         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1050         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
1051                        % {'prop': re.escape(prop)})
1052         template = r'<meta[^>]+?%s[^>]+?%s'
1053         return [
1054             template % (property_re, content_re),
1055             template % (content_re, property_re),
1056         ]
1057
1058     @staticmethod
1059     def _meta_regex(prop):
1060         return r'''(?isx)<meta
1061                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1062                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1063
1064     def _og_search_property(self, prop, html, name=None, **kargs):
1065         if not isinstance(prop, (list, tuple)):
1066             prop = [prop]
1067         if name is None:
1068             name = 'OpenGraph %s' % prop[0]
1069         og_regexes = []
1070         for p in prop:
1071             og_regexes.extend(self._og_regexes(p))
1072         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1073         if escaped is None:
1074             return None
1075         return unescapeHTML(escaped)
1076
1077     def _og_search_thumbnail(self, html, **kargs):
1078         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1079
1080     def _og_search_description(self, html, **kargs):
1081         return self._og_search_property('description', html, fatal=False, **kargs)
1082
1083     def _og_search_title(self, html, **kargs):
1084         return self._og_search_property('title', html, **kargs)
1085
1086     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1087         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1088         if secure:
1089             regexes = self._og_regexes('video:secure_url') + regexes
1090         return self._html_search_regex(regexes, html, name, **kargs)
1091
1092     def _og_search_url(self, html, **kargs):
1093         return self._og_search_property('url', html, **kargs)
1094
1095     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1096         if not isinstance(name, (list, tuple)):
1097             name = [name]
1098         if display_name is None:
1099             display_name = name[0]
1100         return self._html_search_regex(
1101             [self._meta_regex(n) for n in name],
1102             html, display_name, fatal=fatal, group='content', **kwargs)
1103
1104     def _dc_search_uploader(self, html):
1105         return self._html_search_meta('dc.creator', html, 'uploader')
1106
1107     def _rta_search(self, html):
1108         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1109         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1110                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1111                      html):
1112             return 18
1113         return 0
1114
1115     def _media_rating_search(self, html):
1116         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1117         rating = self._html_search_meta('rating', html)
1118
1119         if not rating:
1120             return None
1121
1122         RATING_TABLE = {
1123             'safe for kids': 0,
1124             'general': 8,
1125             '14 years': 14,
1126             'mature': 17,
1127             'restricted': 19,
1128         }
1129         return RATING_TABLE.get(rating.lower())
1130
1131     def _family_friendly_search(self, html):
1132         # See http://schema.org/VideoObject
1133         family_friendly = self._html_search_meta(
1134             'isFamilyFriendly', html, default=None)
1135
1136         if not family_friendly:
1137             return None
1138
1139         RATING_TABLE = {
1140             '1': 0,
1141             'true': 0,
1142             '0': 18,
1143             'false': 18,
1144         }
1145         return RATING_TABLE.get(family_friendly.lower())
1146
1147     def _twitter_search_player(self, html):
1148         return self._html_search_meta('twitter:player', html,
1149                                       'twitter card player')
1150
1151     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1152         json_ld = self._search_regex(
1153             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1154         default = kwargs.get('default', NO_DEFAULT)
1155         if not json_ld:
1156             return default if default is not NO_DEFAULT else {}
1157         # JSON-LD may be malformed and thus `fatal` should be respected.
1158         # At the same time `default` may be passed that assumes `fatal=False`
1159         # for _search_regex. Let's simulate the same behavior here as well.
1160         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1161         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1162
1163     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1164         if isinstance(json_ld, compat_str):
1165             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1166         if not json_ld:
1167             return {}
1168         info = {}
1169         if not isinstance(json_ld, (list, tuple, dict)):
1170             return info
1171         if isinstance(json_ld, dict):
1172             json_ld = [json_ld]
1173
1174         INTERACTION_TYPE_MAP = {
1175             'CommentAction': 'comment',
1176             'AgreeAction': 'like',
1177             'DisagreeAction': 'dislike',
1178             'LikeAction': 'like',
1179             'DislikeAction': 'dislike',
1180             'ListenAction': 'view',
1181             'WatchAction': 'view',
1182             'ViewAction': 'view',
1183         }
1184
1185         def extract_interaction_statistic(e):
1186             interaction_statistic = e.get('interactionStatistic')
1187             if not isinstance(interaction_statistic, list):
1188                 return
1189             for is_e in interaction_statistic:
1190                 if not isinstance(is_e, dict):
1191                     continue
1192                 if is_e.get('@type') != 'InteractionCounter':
1193                     continue
1194                 interaction_type = is_e.get('interactionType')
1195                 if not isinstance(interaction_type, compat_str):
1196                     continue
1197                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1198                 if interaction_count is None:
1199                     continue
1200                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1201                 if not count_kind:
1202                     continue
1203                 count_key = '%s_count' % count_kind
1204                 if info.get(count_key) is not None:
1205                     continue
1206                 info[count_key] = interaction_count
1207
1208         def extract_video_object(e):
1209             assert e['@type'] == 'VideoObject'
1210             info.update({
1211                 'url': e.get('contentUrl'),
1212                 'title': unescapeHTML(e.get('name')),
1213                 'description': unescapeHTML(e.get('description')),
1214                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1215                 'duration': parse_duration(e.get('duration')),
1216                 'timestamp': unified_timestamp(e.get('uploadDate')),
1217                 'filesize': float_or_none(e.get('contentSize')),
1218                 'tbr': int_or_none(e.get('bitrate')),
1219                 'width': int_or_none(e.get('width')),
1220                 'height': int_or_none(e.get('height')),
1221                 'view_count': int_or_none(e.get('interactionCount')),
1222             })
1223             extract_interaction_statistic(e)
1224
1225         for e in json_ld:
1226             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1227                 item_type = e.get('@type')
1228                 if expected_type is not None and expected_type != item_type:
1229                     return info
1230                 if item_type in ('TVEpisode', 'Episode'):
1231                     info.update({
1232                         'episode': unescapeHTML(e.get('name')),
1233                         'episode_number': int_or_none(e.get('episodeNumber')),
1234                         'description': unescapeHTML(e.get('description')),
1235                     })
1236                     part_of_season = e.get('partOfSeason')
1237                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1238                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1239                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1240                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1241                         info['series'] = unescapeHTML(part_of_series.get('name'))
1242                 elif item_type in ('Article', 'NewsArticle'):
1243                     info.update({
1244                         'timestamp': parse_iso8601(e.get('datePublished')),
1245                         'title': unescapeHTML(e.get('headline')),
1246                         'description': unescapeHTML(e.get('articleBody')),
1247                     })
1248                 elif item_type == 'VideoObject':
1249                     extract_video_object(e)
1250                     continue
1251                 video = e.get('video')
1252                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1253                     extract_video_object(video)
1254                 break
1255         return dict((k, v) for k, v in info.items() if v is not None)
1256
1257     @staticmethod
1258     def _hidden_inputs(html):
1259         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1260         hidden_inputs = {}
1261         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1262             attrs = extract_attributes(input)
1263             if not input:
1264                 continue
1265             if attrs.get('type') not in ('hidden', 'submit'):
1266                 continue
1267             name = attrs.get('name') or attrs.get('id')
1268             value = attrs.get('value')
1269             if name and value is not None:
1270                 hidden_inputs[name] = value
1271         return hidden_inputs
1272
1273     def _form_hidden_inputs(self, form_id, html):
1274         form = self._search_regex(
1275             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1276             html, '%s form' % form_id, group='form')
1277         return self._hidden_inputs(form)
1278
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort formats in place, in ascending order of the key computed below.

        formats -- list of format dicts; must not be empty
        field_preference -- optional list or tuple of field names; when
            given, the sort key consists solely of the values of these
            fields, in the given order

        A missing 'tbr' is filled in from 'abr' + 'vbr' and a missing 'ext'
        is determined from 'url', both directly in the format dicts.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # Build the sort key for a single format dict
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # An explicit field list replaces the default key altogether;
            # missing values become '' for format_id and -1 for anything else
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # http(s) gets 0, rtsp -0.5 and every other protocol -0.1
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            # The extension preference is the position of the extension in
            # ORDER (-1 when it is not listed); audio only formats use
            # audio_ext_preference, all others ext_preference
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare item by item, so earlier items take precedence
            # over later ones; missing values are replaced with -1 (or ''
            # for format_id)
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1354
1355     def _check_formats(self, formats, video_id):
1356         if formats:
1357             formats[:] = filter(
1358                 lambda f: self._is_valid_url(
1359                     f['url'], video_id,
1360                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1361                 formats)
1362
1363     @staticmethod
1364     def _remove_duplicate_formats(formats):
1365         format_urls = set()
1366         unique_formats = []
1367         for f in formats:
1368             if f['url'] not in format_urls:
1369                 format_urls.add(f['url'])
1370                 unique_formats.append(f)
1371         formats[:] = unique_formats
1372
1373     def _is_valid_url(self, url, video_id, item='video', headers={}):
1374         url = self._proto_relative_url(url, scheme='http:')
1375         # For now assume non HTTP(S) URLs always valid
1376         if not (url.startswith('http://') or url.startswith('https://')):
1377             return True
1378         try:
1379             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1380             return True
1381         except ExtractorError as e:
1382             if isinstance(e.cause, compat_urllib_error.URLError):
1383                 self.to_screen(
1384                     '%s: %s URL is invalid, skipping' % (video_id, item))
1385                 return False
1386             raise
1387
1388     def http_scheme(self):
1389         """ Either "http:" or "https:", depending on the user's preferences """
1390         return (
1391             'http:'
1392             if self._downloader.params.get('prefer_insecure', False)
1393             else 'https:')
1394
1395     def _proto_relative_url(self, url, scheme=None):
1396         if url is None:
1397             return url
1398         if url.startswith('//'):
1399             if scheme is None:
1400                 scheme = self.http_scheme()
1401             return scheme + url
1402         else:
1403             return url
1404
1405     def _sleep(self, timeout, video_id, msg_template=None):
1406         if msg_template is None:
1407             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1408         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1409         self.to_screen(msg)
1410         time.sleep(timeout)
1411
1412     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1413                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1414                              fatal=True, m3u8_id=None):
1415         manifest = self._download_xml(
1416             manifest_url, video_id, 'Downloading f4m manifest',
1417             'Unable to download f4m manifest',
1418             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1419             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1420             transform_source=transform_source,
1421             fatal=fatal)
1422
1423         if manifest is False:
1424             return []
1425
1426         return self._parse_f4m_formats(
1427             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1428             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1429
1430     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1431                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1432                            fatal=True, m3u8_id=None):
1433         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1434         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1435         if akamai_pv is not None and ';' in akamai_pv.text:
1436             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1437             if playerVerificationChallenge.strip() != '':
1438                 return []
1439
1440         formats = []
1441         manifest_version = '1.0'
1442         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1443         if not media_nodes:
1444             manifest_version = '2.0'
1445             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1446         # Remove unsupported DRM protected media from final formats
1447         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1448         media_nodes = remove_encrypted_media(media_nodes)
1449         if not media_nodes:
1450             return formats
1451
1452         manifest_base_url = get_base_url(manifest)
1453
1454         bootstrap_info = xpath_element(
1455             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1456             'bootstrap info', default=None)
1457
1458         vcodec = None
1459         mime_type = xpath_text(
1460             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1461             'base URL', default=None)
1462         if mime_type and mime_type.startswith('audio/'):
1463             vcodec = 'none'
1464
1465         for i, media_el in enumerate(media_nodes):
1466             tbr = int_or_none(media_el.attrib.get('bitrate'))
1467             width = int_or_none(media_el.attrib.get('width'))
1468             height = int_or_none(media_el.attrib.get('height'))
1469             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1470             # If <bootstrapInfo> is present, the specified f4m is a
1471             # stream-level manifest, and only set-level manifests may refer to
1472             # external resources.  See section 11.4 and section 4 of F4M spec
1473             if bootstrap_info is None:
1474                 media_url = None
1475                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1476                 if manifest_version == '2.0':
1477                     media_url = media_el.attrib.get('href')
1478                 if media_url is None:
1479                     media_url = media_el.attrib.get('url')
1480                 if not media_url:
1481                     continue
1482                 manifest_url = (
1483                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1484                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1485                 # If media_url is itself a f4m manifest do the recursive extraction
1486                 # since bitrates in parent manifest (this one) and media_url manifest
1487                 # may differ leading to inability to resolve the format by requested
1488                 # bitrate in f4m downloader
1489                 ext = determine_ext(manifest_url)
1490                 if ext == 'f4m':
1491                     f4m_formats = self._extract_f4m_formats(
1492                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1493                         transform_source=transform_source, fatal=fatal)
1494                     # Sometimes stream-level manifest contains single media entry that
1495                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1496                     # At the same time parent's media entry in set-level manifest may
1497                     # contain it. We will copy it from parent in such cases.
1498                     if len(f4m_formats) == 1:
1499                         f = f4m_formats[0]
1500                         f.update({
1501                             'tbr': f.get('tbr') or tbr,
1502                             'width': f.get('width') or width,
1503                             'height': f.get('height') or height,
1504                             'format_id': f.get('format_id') if not tbr else format_id,
1505                             'vcodec': vcodec,
1506                         })
1507                     formats.extend(f4m_formats)
1508                     continue
1509                 elif ext == 'm3u8':
1510                     formats.extend(self._extract_m3u8_formats(
1511                         manifest_url, video_id, 'mp4', preference=preference,
1512                         m3u8_id=m3u8_id, fatal=fatal))
1513                     continue
1514             formats.append({
1515                 'format_id': format_id,
1516                 'url': manifest_url,
1517                 'manifest_url': manifest_url,
1518                 'ext': 'flv' if bootstrap_info is not None else None,
1519                 'protocol': 'f4m',
1520                 'tbr': tbr,
1521                 'width': width,
1522                 'height': height,
1523                 'vcodec': vcodec,
1524                 'preference': preference,
1525             })
1526         return formats
1527
1528     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1529         return {
1530             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1531             'url': m3u8_url,
1532             'ext': ext,
1533             'protocol': 'm3u8',
1534             'preference': preference - 100 if preference else -100,
1535             'resolution': 'multiple',
1536             'format_note': 'Quality selection URL',
1537         }
1538
1539     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1540                               entry_protocol='m3u8', preference=None,
1541                               m3u8_id=None, note=None, errnote=None,
1542                               fatal=True, live=False):
1543         res = self._download_webpage_handle(
1544             m3u8_url, video_id,
1545             note=note or 'Downloading m3u8 information',
1546             errnote=errnote or 'Failed to download m3u8 information',
1547             fatal=fatal)
1548
1549         if res is False:
1550             return []
1551
1552         m3u8_doc, urlh = res
1553         m3u8_url = urlh.geturl()
1554
1555         return self._parse_m3u8_formats(
1556             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1557             preference=preference, m3u8_id=m3u8_id, live=live)
1558
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """
        Parse formats from an m3u8 (HLS) playlist document.

        m3u8_doc       -- text of the playlist
        m3u8_url       -- URL of the playlist; relative URIs found in the
                          playlist are resolved against it
        ext            -- stored as 'ext' of every produced format
        entry_protocol -- stored as 'protocol' of every produced format
        preference     -- stored as 'preference' of every produced format
        m3u8_id        -- optional prefix of the generated format ids
        live           -- when true, nothing but m3u8_id is put into the
                          format_id of variant streams

        Returns a list of format dicts: an empty list for playlists
        recognized as Adobe Flash Access or Apple FairPlay protected,
        a single format pointing to m3u8_url for a media playlist, and
        one format per rendition with URI / variant stream for a master
        playlist.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are used as is, anything else is resolved
        # against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in a master playlist thus we can
        # clearly detect a media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # Rendition groups collected from EXT-X-MEDIA tags: GROUP-ID -> list
        # of attribute dicts, in order of appearance
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag that has not
        # been consumed by a URI line yet
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Registers the rendition in its group and, for VIDEO/AUDIO
            # renditions carrying a URI, also emits a format for it
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Although the specification does not mention a NAME attribute
            # for the EXT-X-STREAM-INF tag it still sometimes may be present
            # (see [1] or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from the corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is treated as the URI of a variant
                # stream described by the preceding EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform: audio (and optionally video)
                # bitrates are taken from the stream URL itself
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains an EXT-X-STREAM-INF tag which references an AUDIO
                # rendition group but does not have CODECS and, despite
                # referencing an audio group, it represents a complete
                # (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                # The stream info applies to a single URI line only
                last_stream_inf = {}
        return formats
1717
1718     @staticmethod
1719     def _xpath_ns(path, namespace=None):
1720         if not namespace:
1721             return path
1722         out = []
1723         for c in path.split('/'):
1724             if not c or c == '.':
1725                 out.append(c)
1726             else:
1727                 out.append('{%s}%s' % (namespace, c))
1728         return '/'.join(out)
1729
1730     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1731         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1732
1733         if smil is False:
1734             assert not fatal
1735             return []
1736
1737         namespace = self._parse_smil_namespace(smil)
1738
1739         return self._parse_smil_formats(
1740             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1741
1742     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1743         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1744         if smil is False:
1745             return {}
1746         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1747
1748     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1749         return self._download_xml(
1750             smil_url, video_id, 'Downloading SMIL file',
1751             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1752
1753     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1754         namespace = self._parse_smil_namespace(smil)
1755
1756         formats = self._parse_smil_formats(
1757             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1758         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1759
1760         video_id = os.path.splitext(url_basename(smil_url))[0]
1761         title = None
1762         description = None
1763         upload_date = None
1764         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1765             name = meta.attrib.get('name')
1766             content = meta.attrib.get('content')
1767             if not name or not content:
1768                 continue
1769             if not title and name == 'title':
1770                 title = content
1771             elif not description and name in ('description', 'abstract'):
1772                 description = content
1773             elif not upload_date and name == 'date':
1774                 upload_date = unified_strdate(content)
1775
1776         thumbnails = [{
1777             'id': image.get('type'),
1778             'url': image.get('src'),
1779             'width': int_or_none(image.get('width')),
1780             'height': int_or_none(image.get('height')),
1781         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1782
1783         return {
1784             'id': video_id,
1785             'title': title or video_id,
1786             'description': description,
1787             'upload_date': upload_date,
1788             'thumbnails': thumbnails,
1789             'formats': formats,
1790             'subtitles': subtitles,
1791         }
1792
1793     def _parse_smil_namespace(self, smil):
1794         return self._search_regex(
1795             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1796
1797     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1798         base = smil_url
1799         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1800             b = meta.get('base') or meta.get('httpBase')
1801             if b:
1802                 base = b
1803                 break
1804
1805         formats = []
1806         rtmp_count = 0
1807         http_count = 0
1808         m3u8_count = 0
1809
1810         srcs = []
1811         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1812         for medium in media:
1813             src = medium.get('src')
1814             if not src or src in srcs:
1815                 continue
1816             srcs.append(src)
1817
1818             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1819             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1820             width = int_or_none(medium.get('width'))
1821             height = int_or_none(medium.get('height'))
1822             proto = medium.get('proto')
1823             ext = medium.get('ext')
1824             src_ext = determine_ext(src)
1825             streamer = medium.get('streamer') or base
1826
1827             if proto == 'rtmp' or streamer.startswith('rtmp'):
1828                 rtmp_count += 1
1829                 formats.append({
1830                     'url': streamer,
1831                     'play_path': src,
1832                     'ext': 'flv',
1833                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1834                     'tbr': bitrate,
1835                     'filesize': filesize,
1836                     'width': width,
1837                     'height': height,
1838                 })
1839                 if transform_rtmp_url:
1840                     streamer, src = transform_rtmp_url(streamer, src)
1841                     formats[-1].update({
1842                         'url': streamer,
1843                         'play_path': src,
1844                     })
1845                 continue
1846
1847             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1848             src_url = src_url.strip()
1849
1850             if proto == 'm3u8' or src_ext == 'm3u8':
1851                 m3u8_formats = self._extract_m3u8_formats(
1852                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1853                 if len(m3u8_formats) == 1:
1854                     m3u8_count += 1
1855                     m3u8_formats[0].update({
1856                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1857                         'tbr': bitrate,
1858                         'width': width,
1859                         'height': height,
1860                     })
1861                 formats.extend(m3u8_formats)
1862             elif src_ext == 'f4m':
1863                 f4m_url = src_url
1864                 if not f4m_params:
1865                     f4m_params = {
1866                         'hdcore': '3.2.0',
1867                         'plugin': 'flowplayer-3.2.0.1',
1868                     }
1869                 f4m_url += '&' if '?' in f4m_url else '?'
1870                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1871                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1872             elif src_ext == 'mpd':
1873                 formats.extend(self._extract_mpd_formats(
1874                     src_url, video_id, mpd_id='dash', fatal=False))
1875             elif re.search(r'\.ism/[Mm]anifest', src_url):
1876                 formats.extend(self._extract_ism_formats(
1877                     src_url, video_id, ism_id='mss', fatal=False))
1878             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1879                 http_count += 1
1880                 formats.append({
1881                     'url': src_url,
1882                     'ext': ext or src_ext or 'flv',
1883                     'format_id': 'http-%d' % (bitrate or http_count),
1884                     'tbr': bitrate,
1885                     'filesize': filesize,
1886                     'width': width,
1887                     'height': height,
1888                 })
1889
1890         return formats
1891
1892     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1893         urls = []
1894         subtitles = {}
1895         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1896             src = textstream.get('src')
1897             if not src or src in urls:
1898                 continue
1899             urls.append(src)
1900             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1901             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1902             subtitles.setdefault(lang, []).append({
1903                 'url': src,
1904                 'ext': ext,
1905             })
1906         return subtitles
1907
1908     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1909         xspf = self._download_xml(
1910             xspf_url, playlist_id, 'Downloading xpsf playlist',
1911             'Unable to download xspf manifest', fatal=fatal)
1912         if xspf is False:
1913             return []
1914         return self._parse_xspf(
1915             xspf, playlist_id, xspf_url=xspf_url,
1916             xspf_base_url=base_url(xspf_url))
1917
1918     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1919         NS_MAP = {
1920             'xspf': 'http://xspf.org/ns/0/',
1921             's1': 'http://static.streamone.nl/player/ns/0',
1922         }
1923
1924         entries = []
1925         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1926             title = xpath_text(
1927                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1928             description = xpath_text(
1929                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1930             thumbnail = xpath_text(
1931                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1932             duration = float_or_none(
1933                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1934
1935             formats = []
1936             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1937                 format_url = urljoin(xspf_base_url, location.text)
1938                 if not format_url:
1939                     continue
1940                 formats.append({
1941                     'url': format_url,
1942                     'manifest_url': xspf_url,
1943                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1944                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1945                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1946                 })
1947             self._sort_formats(formats)
1948
1949             entries.append({
1950                 'id': playlist_id,
1951                 'title': title,
1952                 'description': description,
1953                 'thumbnail': thumbnail,
1954                 'duration': duration,
1955                 'formats': formats,
1956             })
1957         return entries
1958
1959     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1960         res = self._download_xml_handle(
1961             mpd_url, video_id,
1962             note=note or 'Downloading MPD manifest',
1963             errnote=errnote or 'Failed to download MPD manifest',
1964             fatal=fatal)
1965         if res is False:
1966             return []
1967         mpd_doc, urlh = res
1968         mpd_base_url = base_url(urlh.geturl())
1969
1970         return self._parse_mpd_formats(
1971             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1972             formats_dict=formats_dict, mpd_url=mpd_url)
1973
1974     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1975         """
1976         Parse formats from MPD manifest.
1977         References:
1978          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1979             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1980          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1981         """
1982         if mpd_doc.get('type') == 'dynamic':
1983             return []
1984
1985         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1986
1987         def _add_ns(path):
1988             return self._xpath_ns(path, namespace)
1989
1990         def is_drm_protected(element):
1991             return element.find(_add_ns('ContentProtection')) is not None
1992
1993         def extract_multisegment_info(element, ms_parent_info):
1994             ms_info = ms_parent_info.copy()
1995
1996             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1997             # common attributes and elements.  We will only extract relevant
1998             # for us.
1999             def extract_common(source):
2000                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2001                 if segment_timeline is not None:
2002                     s_e = segment_timeline.findall(_add_ns('S'))
2003                     if s_e:
2004                         ms_info['total_number'] = 0
2005                         ms_info['s'] = []
2006                         for s in s_e:
2007                             r = int(s.get('r', 0))
2008                             ms_info['total_number'] += 1 + r
2009                             ms_info['s'].append({
2010                                 't': int(s.get('t', 0)),
2011                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2012                                 'd': int(s.attrib['d']),
2013                                 'r': r,
2014                             })
2015                 start_number = source.get('startNumber')
2016                 if start_number:
2017                     ms_info['start_number'] = int(start_number)
2018                 timescale = source.get('timescale')
2019                 if timescale:
2020                     ms_info['timescale'] = int(timescale)
2021                 segment_duration = source.get('duration')
2022                 if segment_duration:
2023                     ms_info['segment_duration'] = float(segment_duration)
2024
2025             def extract_Initialization(source):
2026                 initialization = source.find(_add_ns('Initialization'))
2027                 if initialization is not None:
2028                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2029
2030             segment_list = element.find(_add_ns('SegmentList'))
2031             if segment_list is not None:
2032                 extract_common(segment_list)
2033                 extract_Initialization(segment_list)
2034                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2035                 if segment_urls_e:
2036                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2037             else:
2038                 segment_template = element.find(_add_ns('SegmentTemplate'))
2039                 if segment_template is not None:
2040                     extract_common(segment_template)
2041                     media = segment_template.get('media')
2042                     if media:
2043                         ms_info['media'] = media
2044                     initialization = segment_template.get('initialization')
2045                     if initialization:
2046                         ms_info['initialization'] = initialization
2047                     else:
2048                         extract_Initialization(segment_template)
2049             return ms_info
2050
2051         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2052         formats = []
2053         for period in mpd_doc.findall(_add_ns('Period')):
2054             period_duration = parse_duration(period.get('duration')) or mpd_duration
2055             period_ms_info = extract_multisegment_info(period, {
2056                 'start_number': 1,
2057                 'timescale': 1,
2058             })
2059             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2060                 if is_drm_protected(adaptation_set):
2061                     continue
2062                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2063                 for representation in adaptation_set.findall(_add_ns('Representation')):
2064                     if is_drm_protected(representation):
2065                         continue
2066                     representation_attrib = adaptation_set.attrib.copy()
2067                     representation_attrib.update(representation.attrib)
2068                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2069                     mime_type = representation_attrib['mimeType']
2070                     content_type = mime_type.split('/')[0]
2071                     if content_type == 'text':
2072                         # TODO implement WebVTT downloading
2073                         pass
2074                     elif content_type in ('video', 'audio'):
2075                         base_url = ''
2076                         for element in (representation, adaptation_set, period, mpd_doc):
2077                             base_url_e = element.find(_add_ns('BaseURL'))
2078                             if base_url_e is not None:
2079                                 base_url = base_url_e.text + base_url
2080                                 if re.match(r'^https?://', base_url):
2081                                     break
2082                         if mpd_base_url and not re.match(r'^https?://', base_url):
2083                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2084                                 mpd_base_url += '/'
2085                             base_url = mpd_base_url + base_url
2086                         representation_id = representation_attrib.get('id')
2087                         lang = representation_attrib.get('lang')
2088                         url_el = representation.find(_add_ns('BaseURL'))
2089                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2090                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2091                         f = {
2092                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2093                             'url': base_url,
2094                             'manifest_url': mpd_url,
2095                             'ext': mimetype2ext(mime_type),
2096                             'width': int_or_none(representation_attrib.get('width')),
2097                             'height': int_or_none(representation_attrib.get('height')),
2098                             'tbr': float_or_none(bandwidth, 1000),
2099                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2100                             'fps': int_or_none(representation_attrib.get('frameRate')),
2101                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2102                             'format_note': 'DASH %s' % content_type,
2103                             'filesize': filesize,
2104                             'container': mimetype2ext(mime_type) + '_dash',
2105                         }
2106                         f.update(parse_codecs(representation_attrib.get('codecs')))
2107                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2108
2109                         def prepare_template(template_name, identifiers):
2110                             tmpl = representation_ms_info[template_name]
2111                             # First of, % characters outside $...$ templates
2112                             # must be escaped by doubling for proper processing
2113                             # by % operator string formatting used further (see
2114                             # https://github.com/rg3/youtube-dl/issues/16867).
2115                             t = ''
2116                             in_template = False
2117                             for c in tmpl:
2118                                 t += c
2119                                 if c == '$':
2120                                     in_template = not in_template
2121                                 elif c == '%' and not in_template:
2122                                     t += c
2123                             # Next, $...$ templates are translated to their
2124                             # %(...) counterparts to be used with % operator
2125                             t = t.replace('$RepresentationID$', representation_id)
2126                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2127                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2128                             t.replace('$$', '$')
2129                             return t
2130
2131                         # @initialization is a regular template like @media one
2132                         # so it should be handled just the same way (see
2133                         # https://github.com/rg3/youtube-dl/issues/11605)
2134                         if 'initialization' in representation_ms_info:
2135                             initialization_template = prepare_template(
2136                                 'initialization',
2137                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2138                                 # $Time$ shall not be included for @initialization thus
2139                                 # only $Bandwidth$ remains
2140                                 ('Bandwidth', ))
2141                             representation_ms_info['initialization_url'] = initialization_template % {
2142                                 'Bandwidth': bandwidth,
2143                             }
2144
2145                         def location_key(location):
2146                             return 'url' if re.match(r'^https?://', location) else 'path'
2147
2148                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2149
2150                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2151                             media_location_key = location_key(media_template)
2152
2153                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2154                             # can't be used at the same time
2155                             if '%(Number' in media_template and 's' not in representation_ms_info:
2156                                 segment_duration = None
2157                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2158                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2159                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2160                                 representation_ms_info['fragments'] = [{
2161                                     media_location_key: media_template % {
2162                                         'Number': segment_number,
2163                                         'Bandwidth': bandwidth,
2164                                     },
2165                                     'duration': segment_duration,
2166                                 } for segment_number in range(
2167                                     representation_ms_info['start_number'],
2168                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2169                             else:
2170                                 # $Number*$ or $Time$ in media template with S list available
2171                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2172                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2173                                 representation_ms_info['fragments'] = []
2174                                 segment_time = 0
2175                                 segment_d = None
2176                                 segment_number = representation_ms_info['start_number']
2177
2178                                 def add_segment_url():
2179                                     segment_url = media_template % {
2180                                         'Time': segment_time,
2181                                         'Bandwidth': bandwidth,
2182                                         'Number': segment_number,
2183                                     }
2184                                     representation_ms_info['fragments'].append({
2185                                         media_location_key: segment_url,
2186                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2187                                     })
2188
2189                                 for num, s in enumerate(representation_ms_info['s']):
2190                                     segment_time = s.get('t') or segment_time
2191                                     segment_d = s['d']
2192                                     add_segment_url()
2193                                     segment_number += 1
2194                                     for r in range(s.get('r', 0)):
2195                                         segment_time += segment_d
2196                                         add_segment_url()
2197                                         segment_number += 1
2198                                     segment_time += segment_d
2199                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2200                             # No media template
2201                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2202                             # or any YouTube dashsegments video
2203                             fragments = []
2204                             segment_index = 0
2205                             timescale = representation_ms_info['timescale']
2206                             for s in representation_ms_info['s']:
2207                                 duration = float_or_none(s['d'], timescale)
2208                                 for r in range(s.get('r', 0) + 1):
2209                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2210                                     fragments.append({
2211                                         location_key(segment_uri): segment_uri,
2212                                         'duration': duration,
2213                                     })
2214                                     segment_index += 1
2215                             representation_ms_info['fragments'] = fragments
2216                         elif 'segment_urls' in representation_ms_info:
2217                             # Segment URLs with no SegmentTimeline
2218                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2219                             # https://github.com/rg3/youtube-dl/pull/14844
2220                             fragments = []
2221                             segment_duration = float_or_none(
2222                                 representation_ms_info['segment_duration'],
2223                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2224                             for segment_url in representation_ms_info['segment_urls']:
2225                                 fragment = {
2226                                     location_key(segment_url): segment_url,
2227                                 }
2228                                 if segment_duration:
2229                                     fragment['duration'] = segment_duration
2230                                 fragments.append(fragment)
2231                             representation_ms_info['fragments'] = fragments
2232                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2233                         # No fragments key is present in this case.
2234                         if 'fragments' in representation_ms_info:
2235                             f.update({
2236                                 'fragment_base_url': base_url,
2237                                 'fragments': [],
2238                                 'protocol': 'http_dash_segments',
2239                             })
2240                             if 'initialization_url' in representation_ms_info:
2241                                 initialization_url = representation_ms_info['initialization_url']
2242                                 if not f.get('url'):
2243                                     f['url'] = initialization_url
2244                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2245                             f['fragments'].extend(representation_ms_info['fragments'])
2246                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2247                         # is not necessarily unique within a Period thus formats with
2248                         # the same `format_id` are quite possible. There are numerous examples
2249                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2250                         # https://github.com/rg3/youtube-dl/issues/13919)
2251                         full_info = formats_dict.get(representation_id, {}).copy()
2252                         full_info.update(f)
2253                         formats.append(full_info)
2254                     else:
2255                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2256         return formats
2257
2258     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2259         res = self._download_xml_handle(
2260             ism_url, video_id,
2261             note=note or 'Downloading ISM manifest',
2262             errnote=errnote or 'Failed to download ISM manifest',
2263             fatal=fatal)
2264         if res is False:
2265             return []
2266         ism_doc, urlh = res
2267
2268         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2269
2270     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2271         """
2272         Parse formats from ISM manifest.
2273         References:
2274          1. [MS-SSTR]: Smooth Streaming Protocol,
2275             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2276         """
2277         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2278             return []
2279
2280         duration = int(ism_doc.attrib['Duration'])
2281         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2282
2283         formats = []
2284         for stream in ism_doc.findall('StreamIndex'):
2285             stream_type = stream.get('Type')
2286             if stream_type not in ('video', 'audio'):
2287                 continue
2288             url_pattern = stream.attrib['Url']
2289             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2290             stream_name = stream.get('Name')
2291             for track in stream.findall('QualityLevel'):
2292                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2293                 # TODO: add support for WVC1 and WMAP
2294                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2295                     self.report_warning('%s is not a supported codec' % fourcc)
2296                     continue
2297                 tbr = int(track.attrib['Bitrate']) // 1000
2298                 # [1] does not mention Width and Height attributes. However,
2299                 # they're often present while MaxWidth and MaxHeight are
2300                 # missing, so should be used as fallbacks
2301                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2302                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2303                 sampling_rate = int_or_none(track.get('SamplingRate'))
2304
2305                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2306                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2307
2308                 fragments = []
2309                 fragment_ctx = {
2310                     'time': 0,
2311                 }
2312                 stream_fragments = stream.findall('c')
2313                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2314                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2315                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2316                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2317                     if not fragment_ctx['duration']:
2318                         try:
2319                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2320                         except IndexError:
2321                             next_fragment_time = duration
2322                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2323                     for _ in range(fragment_repeat):
2324                         fragments.append({
2325                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2326                             'duration': fragment_ctx['duration'] / stream_timescale,
2327                         })
2328                         fragment_ctx['time'] += fragment_ctx['duration']
2329
2330                 format_id = []
2331                 if ism_id:
2332                     format_id.append(ism_id)
2333                 if stream_name:
2334                     format_id.append(stream_name)
2335                 format_id.append(compat_str(tbr))
2336
2337                 formats.append({
2338                     'format_id': '-'.join(format_id),
2339                     'url': ism_url,
2340                     'manifest_url': ism_url,
2341                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2342                     'width': width,
2343                     'height': height,
2344                     'tbr': tbr,
2345                     'asr': sampling_rate,
2346                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2347                     'acodec': 'none' if stream_type == 'video' else fourcc,
2348                     'protocol': 'ism',
2349                     'fragments': fragments,
2350                     '_download_params': {
2351                         'duration': duration,
2352                         'timescale': stream_timescale,
2353                         'width': width or 0,
2354                         'height': height or 0,
2355                         'fourcc': fourcc,
2356                         'codec_private_data': track.get('CodecPrivateData'),
2357                         'sampling_rate': sampling_rate,
2358                         'channels': int_or_none(track.get('Channels', 2)),
2359                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2360                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2361                     },
2362                 })
2363         return formats
2364
2365     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2366         def absolute_url(item_url):
2367             return urljoin(base_url, item_url)
2368
2369         def parse_content_type(content_type):
2370             if not content_type:
2371                 return {}
2372             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2373             if ctr:
2374                 mimetype, codecs = ctr.groups()
2375                 f = parse_codecs(codecs)
2376                 f['ext'] = mimetype2ext(mimetype)
2377                 return f
2378             return {}
2379
2380         def _media_formats(src, cur_media_type, type_info={}):
2381             full_url = absolute_url(src)
2382             ext = type_info.get('ext') or determine_ext(full_url)
2383             if ext == 'm3u8':
2384                 is_plain_url = False
2385                 formats = self._extract_m3u8_formats(
2386                     full_url, video_id, ext='mp4',
2387                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2388                     preference=preference, fatal=False)
2389             elif ext == 'mpd':
2390                 is_plain_url = False
2391                 formats = self._extract_mpd_formats(
2392                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2393             else:
2394                 is_plain_url = True
2395                 formats = [{
2396                     'url': full_url,
2397                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2398                 }]
2399             return is_plain_url, formats
2400
2401         entries = []
2402         # amp-video and amp-audio are very similar to their HTML5 counterparts
2403         # so we wll include them right here (see
2404         # https://www.ampproject.org/docs/reference/components/amp-video)
2405         media_tags = [(media_tag, media_type, '')
2406                       for media_tag, media_type
2407                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2408         media_tags.extend(re.findall(
2409             # We only allow video|audio followed by a whitespace or '>'.
2410             # Allowing more characters may end up in significant slow down (see
2411             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2412             # http://www.porntrex.com/maps/videositemap.xml).
2413             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2414         for media_tag, media_type, media_content in media_tags:
2415             media_info = {
2416                 'formats': [],
2417                 'subtitles': {},
2418             }
2419             media_attributes = extract_attributes(media_tag)
2420             src = media_attributes.get('src')
2421             if src:
2422                 _, formats = _media_formats(src, media_type)
2423                 media_info['formats'].extend(formats)
2424             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2425             if media_content:
2426                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2427                     source_attributes = extract_attributes(source_tag)
2428                     src = source_attributes.get('src')
2429                     if not src:
2430                         continue
2431                     f = parse_content_type(source_attributes.get('type'))
2432                     is_plain_url, formats = _media_formats(src, media_type, f)
2433                     if is_plain_url:
2434                         # res attribute is not standard but seen several times
2435                         # in the wild
2436                         f.update({
2437                             'height': int_or_none(source_attributes.get('res')),
2438                             'format_id': source_attributes.get('label'),
2439                         })
2440                         f.update(formats[0])
2441                         media_info['formats'].append(f)
2442                     else:
2443                         media_info['formats'].extend(formats)
2444                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2445                     track_attributes = extract_attributes(track_tag)
2446                     kind = track_attributes.get('kind')
2447                     if not kind or kind in ('subtitles', 'captions'):
2448                         src = track_attributes.get('src')
2449                         if not src:
2450                             continue
2451                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2452                         media_info['subtitles'].setdefault(lang, []).append({
2453                             'url': absolute_url(src),
2454                         })
2455             for f in media_info['formats']:
2456                 f.setdefault('http_headers', {})['Referer'] = base_url
2457             if media_info['formats'] or media_info['subtitles']:
2458                 entries.append(media_info)
2459         return entries
2460
2461     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2462         formats = []
2463         hdcore_sign = 'hdcore=3.7.0'
2464         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2465         hds_host = hosts.get('hds')
2466         if hds_host:
2467             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2468         if 'hdcore=' not in f4m_url:
2469             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2470         f4m_formats = self._extract_f4m_formats(
2471             f4m_url, video_id, f4m_id='hds', fatal=False)
2472         for entry in f4m_formats:
2473             entry.update({'extra_param_to_segment_url': hdcore_sign})
2474         formats.extend(f4m_formats)
2475         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2476         hls_host = hosts.get('hls')
2477         if hls_host:
2478             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2479         formats.extend(self._extract_m3u8_formats(
2480             m3u8_url, video_id, 'mp4', 'm3u8_native',
2481             m3u8_id='hls', fatal=False))
2482         return formats
2483
2484     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2485         query = compat_urlparse.urlparse(url).query
2486         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2487         mobj = re.search(
2488             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2489         url_base = mobj.group('url')
2490         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2491         formats = []
2492
2493         def manifest_url(manifest):
2494             m_url = '%s/%s' % (http_base_url, manifest)
2495             if query:
2496                 m_url += '?%s' % query
2497             return m_url
2498
2499         if 'm3u8' not in skip_protocols:
2500             formats.extend(self._extract_m3u8_formats(
2501                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2502                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2503         if 'f4m' not in skip_protocols:
2504             formats.extend(self._extract_f4m_formats(
2505                 manifest_url('manifest.f4m'),
2506                 video_id, f4m_id='hds', fatal=False))
2507         if 'dash' not in skip_protocols:
2508             formats.extend(self._extract_mpd_formats(
2509                 manifest_url('manifest.mpd'),
2510                 video_id, mpd_id='dash', fatal=False))
2511         if re.search(r'(?:/smil:|\.smil)', url_base):
2512             if 'smil' not in skip_protocols:
2513                 rtmp_formats = self._extract_smil_formats(
2514                     manifest_url('jwplayer.smil'),
2515                     video_id, fatal=False)
2516                 for rtmp_format in rtmp_formats:
2517                     rtsp_format = rtmp_format.copy()
2518                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2519                     del rtsp_format['play_path']
2520                     del rtsp_format['ext']
2521                     rtsp_format.update({
2522                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2523                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2524                         'protocol': 'rtsp',
2525                     })
2526                     formats.extend([rtmp_format, rtsp_format])
2527         else:
2528             for protocol in ('rtmp', 'rtsp'):
2529                 if protocol not in skip_protocols:
2530                     formats.append({
2531                         'url': '%s:%s' % (protocol, url_base),
2532                         'format_id': protocol,
2533                         'protocol': protocol,
2534                     })
2535         return formats
2536
2537     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2538         mobj = re.search(
2539             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2540             webpage)
2541         if mobj:
2542             try:
2543                 jwplayer_data = self._parse_json(mobj.group('options'),
2544                                                  video_id=video_id,
2545                                                  transform_source=transform_source)
2546             except ExtractorError:
2547                 pass
2548             else:
2549                 if isinstance(jwplayer_data, dict):
2550                     return jwplayer_data
2551
2552     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2553         jwplayer_data = self._find_jwplayer_data(
2554             webpage, video_id, transform_source=js_to_json)
2555         return self._parse_jwplayer_data(
2556             jwplayer_data, video_id, *args, **kwargs)
2557
2558     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2559                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2560         # JWPlayer backward compatibility: flattened playlists
2561         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2562         if 'playlist' not in jwplayer_data:
2563             jwplayer_data = {'playlist': [jwplayer_data]}
2564
2565         entries = []
2566
2567         # JWPlayer backward compatibility: single playlist item
2568         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2569         if not isinstance(jwplayer_data['playlist'], list):
2570             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2571
2572         for video_data in jwplayer_data['playlist']:
2573             # JWPlayer backward compatibility: flattened sources
2574             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2575             if 'sources' not in video_data:
2576                 video_data['sources'] = [video_data]
2577
2578             this_video_id = video_id or video_data['mediaid']
2579
2580             formats = self._parse_jwplayer_formats(
2581                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2582                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2583
2584             subtitles = {}
2585             tracks = video_data.get('tracks')
2586             if tracks and isinstance(tracks, list):
2587                 for track in tracks:
2588                     if not isinstance(track, dict):
2589                         continue
2590                     track_kind = track.get('kind')
2591                     if not track_kind or not isinstance(track_kind, compat_str):
2592                         continue
2593                     if track_kind.lower() not in ('captions', 'subtitles'):
2594                         continue
2595                     track_url = urljoin(base_url, track.get('file'))
2596                     if not track_url:
2597                         continue
2598                     subtitles.setdefault(track.get('label') or 'en', []).append({
2599                         'url': self._proto_relative_url(track_url)
2600                     })
2601
2602             entry = {
2603                 'id': this_video_id,
2604                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2605                 'description': video_data.get('description'),
2606                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2607                 'timestamp': int_or_none(video_data.get('pubdate')),
2608                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2609                 'subtitles': subtitles,
2610             }
2611             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2612             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2613                 entry.update({
2614                     '_type': 'url_transparent',
2615                     'url': formats[0]['url'],
2616                 })
2617             else:
2618                 self._sort_formats(formats)
2619                 entry['formats'] = formats
2620             entries.append(entry)
2621         if len(entries) == 1:
2622             return entries[0]
2623         else:
2624             return self.playlist_result(entries)
2625
2626     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2627                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2628         urls = []
2629         formats = []
2630         for source in jwplayer_sources_data:
2631             if not isinstance(source, dict):
2632                 continue
2633             source_url = self._proto_relative_url(source.get('file'))
2634             if not source_url:
2635                 continue
2636             if base_url:
2637                 source_url = compat_urlparse.urljoin(base_url, source_url)
2638             if source_url in urls:
2639                 continue
2640             urls.append(source_url)
2641             source_type = source.get('type') or ''
2642             ext = mimetype2ext(source_type) or determine_ext(source_url)
2643             if source_type == 'hls' or ext == 'm3u8':
2644                 formats.extend(self._extract_m3u8_formats(
2645                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2646                     m3u8_id=m3u8_id, fatal=False))
2647             elif source_type == 'dash' or ext == 'mpd':
2648                 formats.extend(self._extract_mpd_formats(
2649                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2650             elif ext == 'smil':
2651                 formats.extend(self._extract_smil_formats(
2652                     source_url, video_id, fatal=False))
2653             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2654             elif source_type.startswith('audio') or ext in (
2655                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2656                 formats.append({
2657                     'url': source_url,
2658                     'vcodec': 'none',
2659                     'ext': ext,
2660                 })
2661             else:
2662                 height = int_or_none(source.get('height'))
2663                 if height is None:
2664                     # Often no height is provided but there is a label in
2665                     # format like "1080p", "720p SD", or 1080.
2666                     height = int_or_none(self._search_regex(
2667                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2668                         'height', default=None))
2669                 a_format = {
2670                     'url': source_url,
2671                     'width': int_or_none(source.get('width')),
2672                     'height': height,
2673                     'tbr': int_or_none(source.get('bitrate')),
2674                     'ext': ext,
2675                 }
2676                 if source_url.startswith('rtmp'):
2677                     a_format['ext'] = 'flv'
2678                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2679                     # of jwplayer.flash.swf
2680                     rtmp_url_parts = re.split(
2681                         r'((?:mp4|mp3|flv):)', source_url, 1)
2682                     if len(rtmp_url_parts) == 3:
2683                         rtmp_url, prefix, play_path = rtmp_url_parts
2684                         a_format.update({
2685                             'url': rtmp_url,
2686                             'play_path': prefix + play_path,
2687                         })
2688                     if rtmp_params:
2689                         a_format.update(rtmp_params)
2690                 formats.append(a_format)
2691         return formats
2692
2693     def _live_title(self, name):
2694         """ Generate the title for a live video """
2695         now = datetime.datetime.now()
2696         now_str = now.strftime('%Y-%m-%d %H:%M')
2697         return name + ' ' + now_str
2698
2699     def _int(self, v, name, fatal=False, **kwargs):
2700         res = int_or_none(v, **kwargs)
2701         if 'get_attr' in kwargs:
2702             print(getattr(v, kwargs['get_attr']))
2703         if res is None:
2704             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2705             if fatal:
2706                 raise ExtractorError(msg)
2707             else:
2708                 self._downloader.report_warning(msg)
2709         return res
2710
2711     def _float(self, v, name, fatal=False, **kwargs):
2712         res = float_or_none(v, **kwargs)
2713         if res is None:
2714             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2715             if fatal:
2716                 raise ExtractorError(msg)
2717             else:
2718                 self._downloader.report_warning(msg)
2719         return res
2720
2721     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2722                     path='/', secure=False, discard=False, rest={}, **kwargs):
2723         cookie = compat_cookiejar.Cookie(
2724             0, name, value, port, port is not None, domain, True,
2725             domain.startswith('.'), path, True, secure, expire_time,
2726             discard, None, None, rest)
2727         self._downloader.cookiejar.set_cookie(cookie)
2728
2729     def _get_cookies(self, url):
2730         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2731         req = sanitized_Request(url)
2732         self._downloader.cookiejar.add_cookie_header(req)
2733         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2734
2735     def get_testcases(self, include_onlymatching=False):
2736         t = getattr(self, '_TEST', None)
2737         if t:
2738             assert not hasattr(self, '_TESTS'), \
2739                 '%s has _TEST and _TESTS' % type(self).__name__
2740             tests = [t]
2741         else:
2742             tests = getattr(self, '_TESTS', [])
2743         for t in tests:
2744             if not include_onlymatching and t.get('only_matching', False):
2745                 continue
2746             t['name'] = type(self).__name__[:-len('IE')]
2747             yield t
2748
2749     def is_suitable(self, age_limit):
2750         """ Test whether the extractor is generally suitable for the given
2751         age limit (i.e. pornographic sites are not, all others usually are) """
2752
2753         any_restricted = False
2754         for tc in self.get_testcases(include_onlymatching=False):
2755             if tc.get('playlist', []):
2756                 tc = tc['playlist'][0]
2757             is_restricted = age_restricted(
2758                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2759             if not is_restricted:
2760                 return True
2761             any_restricted = any_restricted or is_restricted
2762         return not any_restricted
2763
2764     def extract_subtitles(self, *args, **kwargs):
2765         if (self._downloader.params.get('writesubtitles', False) or
2766                 self._downloader.params.get('listsubtitles')):
2767             return self._get_subtitles(*args, **kwargs)
2768         return {}
2769
2770     def _get_subtitles(self, *args, **kwargs):
2771         raise NotImplementedError('This method must be implemented by subclasses')
2772
2773     @staticmethod
2774     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2775         """ Merge subtitle items for one language. Items with duplicated URLs
2776         will be dropped. """
2777         list1_urls = set([item['url'] for item in subtitle_list1])
2778         ret = list(subtitle_list1)
2779         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2780         return ret
2781
2782     @classmethod
2783     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2784         """ Merge two subtitle dictionaries, language by language. """
2785         ret = dict(subtitle_dict1)
2786         for lang in subtitle_dict2:
2787             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2788         return ret
2789
2790     def extract_automatic_captions(self, *args, **kwargs):
2791         if (self._downloader.params.get('writeautomaticsub', False) or
2792                 self._downloader.params.get('listsubtitles')):
2793             return self._get_automatic_captions(*args, **kwargs)
2794         return {}
2795
2796     def _get_automatic_captions(self, *args, **kwargs):
2797         raise NotImplementedError('This method must be implemented by subclasses')
2798
2799     def mark_watched(self, *args, **kwargs):
2800         if (self._downloader.params.get('mark_watched', False) and
2801                 (self._get_login_info()[0] is not None or
2802                     self._downloader.params.get('cookiefile') is not None)):
2803             self._mark_watched(*args, **kwargs)
2804
2805     def _mark_watched(self, *args, **kwargs):
2806         raise NotImplementedError('This method must be implemented by subclasses')
2807
2808     def geo_verification_headers(self):
2809         headers = {}
2810         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2811         if geo_verification_proxy:
2812             headers['Ytdl-request-proxy'] = geo_verification_proxy
2813         return headers
2814
2815     def _generic_id(self, url):
2816         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2817
2818     def _generic_title(self, url):
2819         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2820
2821
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.

    Search "URLs" have the form _SEARCH_KEY<prefix>:{query} where <prefix>
    is empty (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        Raises ExtractorError for a query that does not match the expected
        form. Requests for more than _MAX_RESULTS results are capped with
        a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(
                '%s returns max %i results (you requested %i)'
                % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY