[f4m] Prefer baseURL for relative URLs (closes #14660)
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_http_client,
23     compat_os_name,
24     compat_str,
25     compat_urllib_error,
26     compat_urllib_parse_unquote,
27     compat_urllib_parse_urlencode,
28     compat_urllib_request,
29     compat_urlparse,
30     compat_xml_parse_error,
31 )
32 from ..downloader.f4m import (
33     get_base_url,
34     remove_encrypted_media,
35 )
36 from ..utils import (
37     NO_DEFAULT,
38     age_restricted,
39     base_url,
40     bug_reports_message,
41     clean_html,
42     compiled_regex_type,
43     determine_ext,
44     determine_protocol,
45     error_to_compat_str,
46     ExtractorError,
47     extract_attributes,
48     fix_xml_ampersands,
49     float_or_none,
50     GeoRestrictedError,
51     GeoUtils,
52     int_or_none,
53     js_to_json,
54     mimetype2ext,
55     orderedSet,
56     parse_codecs,
57     parse_duration,
58     parse_iso8601,
59     parse_m3u8_attributes,
60     RegexNotFoundError,
61     sanitized_Request,
62     sanitize_filename,
63     unescapeHTML,
64     unified_strdate,
65     unified_timestamp,
66     update_Request,
67     update_url_query,
68     urljoin,
69     url_basename,
70     xpath_element,
71     xpath_text,
72     xpath_with_ns,
73 )
74
75
76 class InfoExtractor(object):
77     """Information Extractor class.
78
79     Information extractors are the classes that, given a URL, extract
80     information about the video (or videos) the URL refers to. This
81     information includes the real video URL, the video title, author and
82     others. The information is stored in a dictionary which is then
83     passed to the YoutubeDL. The YoutubeDL processes this
84     information possibly downloading the video to the file system, among
85     other possible outcomes.
86
87     The type field determines the type of the result.
88     By far the most common value (and the default if _type is missing) is
89     "video", which indicates a single video.
90
91     For a video, the dictionaries must include the following fields:
92
93     id:             Video identifier.
94     title:          Video title, unescaped.
95
96     Additionally, it must contain either a formats entry or a url one:
97
98     formats:        A list of dictionaries for each format available, ordered
99                     from worst to best quality.
100
101                     Potential fields:
102                     * url        Mandatory. The URL of the video file
103                     * manifest_url
104                                  The URL of the manifest file in case of
105                                  fragmented media (DASH, hls, hds)
106                     * ext        Will be calculated from URL if missing
107                     * format     A human-readable description of the format
108                                  ("mp4 container with h264/opus").
109                                  Calculated from the format_id, width, height.
110                                  and format_note fields if missing.
111                     * format_id  A short description of the format
112                                  ("mp4_h264_opus" or "19").
113                                 Technically optional, but strongly recommended.
114                     * format_note Additional info about the format
115                                  ("3D" or "DASH video")
116                     * width      Width of the video, if known
117                     * height     Height of the video, if known
118                     * resolution Textual description of width and height
119                     * tbr        Average bitrate of audio and video in KBit/s
120                     * abr        Average audio bitrate in KBit/s
121                     * acodec     Name of the audio codec in use
122                     * asr        Audio sampling rate in Hertz
123                     * vbr        Average video bitrate in KBit/s
124                     * fps        Frame rate
125                     * vcodec     Name of the video codec in use
126                     * container  Name of the container format
127                     * filesize   The number of bytes, if known in advance
128                     * filesize_approx  An estimate for the number of bytes
129                     * player_url SWF Player URL (used for rtmpdump).
130                     * protocol   The protocol that will be used for the actual
131                                  download, lower-case.
132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
133                                  "m3u8", "m3u8_native" or "http_dash_segments".
134                     * fragment_base_url
135                                  Base URL for fragments. Each fragment's path
136                                  value (if present) will be relative to
137                                  this URL.
138                     * fragments  A list of fragments of a fragmented media.
139                                  Each fragment entry must contain either an url
140                                  or a path. If an url is present it should be
141                                  considered by a client. Otherwise both path and
142                                  fragment_base_url must be present. Here is
143                                  the list of all potential fields:
144                                  * "url" - fragment's URL
145                                  * "path" - fragment's path relative to
146                                             fragment_base_url
147                                  * "duration" (optional, int or float)
148                                  * "filesize" (optional, int)
149                     * preference Order number of this format. If this field is
150                                  present and not None, the formats get sorted
151                                  by this field, regardless of all other values.
152                                  -1 for default (order by other properties),
153                                  -2 or smaller for less than default.
154                                  < -1000 to hide the format (if there is
155                                     another one which is strictly better)
156                     * language   Language code, e.g. "de" or "en-US".
157                     * language_preference  Is this in the language mentioned in
158                                  the URL?
159                                  10 if it's what the URL is about,
160                                  -1 for default (don't know),
161                                  -10 otherwise, other values reserved for now.
162                     * quality    Order number of the video quality of this
163                                  format, irrespective of the file format.
164                                  -1 for default (order by other properties),
165                                  -2 or smaller for less than default.
166                     * source_preference  Order number for this video source
167                                   (quality takes higher priority)
168                                  -1 for default (order by other properties),
169                                  -2 or smaller for less than default.
170                     * http_headers  A dictionary of additional HTTP headers
171                                  to add to the request.
172                     * stretched_ratio  If given and not 1, indicates that the
173                                  video's pixels are not square.
174                                  width : height ratio as float.
175                     * no_resume  The server does not support resuming the
176                                  (HTTP or RTMP) download. Boolean.
177
178     url:            Final video URL.
179     ext:            Video filename extension.
180     format:         The video format, defaults to ext (used for --get-format)
181     player_url:     SWF Player URL (used for rtmpdump).
182
183     The following fields are optional:
184
185     alt_title:      A secondary title of the video.
186     display_id      An alternative identifier for the video, not necessarily
187                     unique, but available before title. Typically, id is
188                     something like "4234987", title "Dancing naked mole rats",
189                     and display_id "dancing-naked-mole-rats"
190     thumbnails:     A list of dictionaries, with the following entries:
191                         * "id" (optional, string) - Thumbnail format ID
192                         * "url"
193                         * "preference" (optional, int) - quality of the image
194                         * "width" (optional, int)
195                         * "height" (optional, int)
196                         * "resolution" (optional, string "{width}x{height}",
197                                         deprecated)
198                         * "filesize" (optional, int)
199     thumbnail:      Full URL to a video thumbnail image.
200     description:    Full video description.
201     uploader:       Full name of the video uploader.
202     license:        License name the video is licensed under.
203     creator:        The creator of the video.
204     release_date:   The date (YYYYMMDD) when the video was released.
205     timestamp:      UNIX timestamp of the moment the video became available.
206     upload_date:    Video upload date (YYYYMMDD).
207                     If not explicitly set, calculated from timestamp.
208     uploader_id:    Nickname or id of the video uploader.
209     uploader_url:   Full URL to a personal webpage of the video uploader.
210     location:       Physical location where the video was filmed.
211     subtitles:      The available subtitles as a dictionary in the format
212                     {tag: subformats}. "tag" is usually a language code, and
213                     "subformats" is a list sorted from lower to higher
214                     preference, each element is a dictionary with the "ext"
215                     entry and one of:
216                         * "data": The subtitles file contents
217                         * "url": A URL pointing to the subtitles file
218                     "ext" will be calculated from URL if missing
219     automatic_captions: Like 'subtitles', used by the YoutubeIE for
220                     automatically generated captions
221     duration:       Length of the video in seconds, as an integer or float.
222     view_count:     How many users have watched the video on the platform.
223     like_count:     Number of positive ratings of the video
224     dislike_count:  Number of negative ratings of the video
225     repost_count:   Number of reposts of the video
226     average_rating: Average rating given by users, the scale used depends on the webpage
227     comment_count:  Number of comments on the video
228     comments:       A list of comments, each with one or more of the following
229                     properties (all but one of text or html optional):
230                         * "author" - human-readable name of the comment author
231                         * "author_id" - user ID of the comment author
232                         * "id" - Comment ID
233                         * "html" - Comment as HTML
234                         * "text" - Plain text of the comment
235                         * "timestamp" - UNIX timestamp of comment
236                         * "parent" - ID of the comment this one is replying to.
237                                      Set to "root" to indicate that this is a
238                                      comment to the original video.
239     age_limit:      Age restriction for the video, as an integer (years)
240     webpage_url:    The URL to the video webpage, if given to youtube-dl it
241                     should allow to get the same result again. (It will be set
242                     by YoutubeDL if it's missing)
243     categories:     A list of categories that the video falls in, for example
244                     ["Sports", "Berlin"]
245     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
246     is_live:        True, False, or None (=unknown). Whether this video is a
247                     live stream that goes on instead of a fixed-length video.
248     start_time:     Time in seconds where the reproduction should start, as
249                     specified in the URL.
250     end_time:       Time in seconds where the reproduction should end, as
251                     specified in the URL.
252     chapters:       A list of dictionaries, with the following entries:
253                         * "start_time" - The start time of the chapter in seconds
254                         * "end_time" - The end time of the chapter in seconds
255                         * "title" (optional, string)
256
257     The following fields should only be used when the video belongs to some logical
258     chapter or section:
259
260     chapter:        Name or title of the chapter the video belongs to.
261     chapter_number: Number of the chapter the video belongs to, as an integer.
262     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
263
264     The following fields should only be used when the video is an episode of some
265     series, programme or podcast:
266
267     series:         Title of the series or programme the video episode belongs to.
268     season:         Title of the season the video episode belongs to.
269     season_number:  Number of the season the video episode belongs to, as an integer.
270     season_id:      Id of the season the video episode belongs to, as a unicode string.
271     episode:        Title of the video episode. Unlike mandatory video title field,
272                     this field should denote the exact title of the video episode
273                     without any kind of decoration.
274     episode_number: Number of the video episode within a season, as an integer.
275     episode_id:     Id of the video episode, as a unicode string.
276
277     The following fields should only be used when the media is a track or a part of
278     a music album:
279
280     track:          Title of the track.
281     track_number:   Number of the track within an album or a disc, as an integer.
282     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
283                     as a unicode string.
284     artist:         Artist(s) of the track.
285     genre:          Genre(s) of the track.
286     album:          Title of the album the track belongs to.
287     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
288     album_artist:   List of all artists appeared on the album (e.g.
289                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
290                     and compilations).
291     disc_number:    Number of the disc or other physical medium the track belongs to,
292                     as an integer.
293     release_year:   Year (YYYY) when the album was released.
294
295     Unless mentioned otherwise, the fields should be Unicode strings.
296
297     Unless mentioned otherwise, None is equivalent to absence of information.
298
299
300     _type "playlist" indicates multiple videos.
301     There must be a key "entries", which is a list, an iterable, or a PagedList
302     object, each element of which is a valid dictionary by this specification.
303
304     Additionally, playlists can have "title", "description" and "id" attributes
305     with the same semantics as videos (see above).
306
307
308     _type "multi_video" indicates that there are multiple videos that
309     form a single show, for example multiple acts of an opera or TV episode.
310     It must have an entries key like a playlist and contain all the keys
311     required for a video at the same time.
312
313
314     _type "url" indicates that the video must be extracted from another
315     location, possibly by a different extractor. Its only required key is:
316     "url" - the next URL to extract.
317     The key "ie_key" can be set to the class name (minus the trailing "IE",
318     e.g. "Youtube") if the extractor class is known in advance.
319     Additionally, the dictionary may have any properties of the resolved entity
320     known in advance, for example "title" if the title of the referred video is
321     known ahead of time.
322
323
324     _type "url_transparent" entities have the same specification as "url", but
325     indicate that the given additional information is more precise than the one
326     associated with the resolved URL.
327     This is useful when a site employs a video service that hosts the video and
328     its technical metadata, but that video service does not embed a useful
329     title, description etc.
330
331
332     Subclasses of this one should re-define the _real_initialize() and
333     _real_extract() methods and define a _VALID_URL regexp.
334     Probably, they should also be added to the list of extractors.
335
336     _GEO_BYPASS attribute may be set to False in order to disable
337     geo restriction bypass mechanisms for a particular extractor.
338     Though it won't disable explicit geo restriction bypass based on
339     country code provided with geo_bypass_country. (experimental)
340
341     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
342     countries for this extractor. One of these countries will be used by
343     geo restriction bypass mechanism right away in order to bypass
344     geo restriction, of course, if the mechanism is not disabled. (experimental)
345
346     NB: both these geo attributes are experimental and may change in future
347     or be completely removed.
348
349     Finally, the _WORKING attribute should be set to False for broken IEs
350     in order to warn the users and skip the tests.
351     """
352
353     _ready = False
354     _downloader = None
355     _x_forwarded_for_ip = None
356     _GEO_BYPASS = True
357     _GEO_COUNTRIES = None
358     _WORKING = True
359
360     def __init__(self, downloader=None):
361         """Constructor. Receives an optional downloader."""
362         self._ready = False
363         self._x_forwarded_for_ip = None
364         self.set_downloader(downloader)
365
366     @classmethod
367     def suitable(cls, url):
368         """Receives a URL and returns True if suitable for this IE."""
369
370         # This does not use has/getattr intentionally - we want to know whether
371         # we have cached the regexp for *this* class, whereas getattr would also
372         # match the superclass
373         if '_VALID_URL_RE' not in cls.__dict__:
374             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
375         return cls._VALID_URL_RE.match(url) is not None
376
377     @classmethod
378     def _match_id(cls, url):
379         if '_VALID_URL_RE' not in cls.__dict__:
380             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
381         m = cls._VALID_URL_RE.match(url)
382         assert m
383         return compat_str(m.group('id'))
384
385     @classmethod
386     def working(cls):
387         """Getter method for _WORKING."""
388         return cls._WORKING
389
390     def initialize(self):
391         """Initializes an instance (authentication, etc)."""
392         self._initialize_geo_bypass(self._GEO_COUNTRIES)
393         if not self._ready:
394             self._real_initialize()
395             self._ready = True
396
397     def _initialize_geo_bypass(self, countries):
398         """
399         Initialize geo restriction bypass mechanism.
400
401         This method is used to initialize geo bypass mechanism based on faking
402         X-Forwarded-For HTTP header. A random country from provided country list
403         is selected and a random IP belonging to this country is generated. This
404         IP will be passed as X-Forwarded-For HTTP header in all subsequent
405         HTTP requests.
406
407         This method will be used for initial geo bypass mechanism initialization
408         during the instance initialization with _GEO_COUNTRIES.
409
410         You may also manually call it from extractor's code if geo countries
411         information is not available beforehand (e.g. obtained during
412         extraction) or due to some another reason.
413         """
414         if not self._x_forwarded_for_ip:
415             country_code = self._downloader.params.get('geo_bypass_country', None)
416             # If there is no explicit country for geo bypass specified and
417             # the extractor is known to be geo restricted let's fake IP
418             # as X-Forwarded-For right away.
419             if (not country_code and
420                     self._GEO_BYPASS and
421                     self._downloader.params.get('geo_bypass', True) and
422                     countries):
423                 country_code = random.choice(countries)
424             if country_code:
425                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
426                 if self._downloader.params.get('verbose', False):
427                     self._downloader.to_screen(
428                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
429                         % (self._x_forwarded_for_ip, country_code.upper()))
430
431     def extract(self, url):
432         """Extracts URL information and returns it in list of dicts."""
433         try:
434             for _ in range(2):
435                 try:
436                     self.initialize()
437                     ie_result = self._real_extract(url)
438                     if self._x_forwarded_for_ip:
439                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
440                     return ie_result
441                 except GeoRestrictedError as e:
442                     if self.__maybe_fake_ip_and_retry(e.countries):
443                         continue
444                     raise
445         except ExtractorError:
446             raise
447         except compat_http_client.IncompleteRead as e:
448             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
449         except (KeyError, StopIteration) as e:
450             raise ExtractorError('An extractor error has occurred.', cause=e)
451
452     def __maybe_fake_ip_and_retry(self, countries):
453         if (not self._downloader.params.get('geo_bypass_country', None) and
454                 self._GEO_BYPASS and
455                 self._downloader.params.get('geo_bypass', True) and
456                 not self._x_forwarded_for_ip and
457                 countries):
458             country_code = random.choice(countries)
459             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
460             if self._x_forwarded_for_ip:
461                 self.report_warning(
462                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
463                     % (self._x_forwarded_for_ip, country_code.upper()))
464                 return True
465         return False
466
467     def set_downloader(self, downloader):
468         """Sets the downloader for this IE."""
469         self._downloader = downloader
470
471     def _real_initialize(self):
472         """Real initialization process. Redefine in subclasses."""
473         pass
474
475     def _real_extract(self, url):
476         """Real extraction process. Redefine in subclasses."""
477         pass
478
479     @classmethod
480     def ie_key(cls):
481         """A string for getting the InfoExtractor with get_info_extractor"""
482         return compat_str(cls.__name__[:-2])
483
484     @property
485     def IE_NAME(self):
486         return compat_str(type(self).__name__[:-2])
487
488     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
489         """ Returns the response handle """
490         if note is None:
491             self.report_download_webpage(video_id)
492         elif note is not False:
493             if video_id is None:
494                 self.to_screen('%s' % (note,))
495             else:
496                 self.to_screen('%s: %s' % (video_id, note))
497         if isinstance(url_or_request, compat_urllib_request.Request):
498             url_or_request = update_Request(
499                 url_or_request, data=data, headers=headers, query=query)
500         else:
501             if query:
502                 url_or_request = update_url_query(url_or_request, query)
503             if data is not None or headers:
504                 url_or_request = sanitized_Request(url_or_request, data, headers)
505         try:
506             return self._downloader.urlopen(url_or_request)
507         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
508             if errnote is False:
509                 return False
510             if errnote is None:
511                 errnote = 'Unable to download webpage'
512
513             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
514             if fatal:
515                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
516             else:
517                 self._downloader.report_warning(errmsg)
518                 return False
519
520     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
521         """ Returns a tuple (page content as string, URL handle) """
522         # Strip hashes from the URL (#1038)
523         if isinstance(url_or_request, (compat_str, str)):
524             url_or_request = url_or_request.partition('#')[0]
525
526         # Some sites check X-Forwarded-For HTTP header in order to figure out
527         # the origin of the client behind proxy. This allows bypassing geo
528         # restriction by faking this header's value to IP that belongs to some
529         # geo unrestricted country. We will do so once we encounter any
530         # geo restriction error.
531         if self._x_forwarded_for_ip:
532             if 'X-Forwarded-For' not in headers:
533                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
534
535         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
536         if urlh is False:
537             assert not fatal
538             return False
539         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
540         return (content, urlh)
541
542     @staticmethod
543     def _guess_encoding_from_content(content_type, webpage_bytes):
544         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
545         if m:
546             encoding = m.group(1)
547         else:
548             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
549                           webpage_bytes[:1024])
550             if m:
551                 encoding = m.group(1).decode('ascii')
552             elif webpage_bytes.startswith(b'\xff\xfe'):
553                 encoding = 'utf-16'
554             else:
555                 encoding = 'utf-8'
556
557         return encoding
558
559     def __check_blocked(self, content):
560         first_block = content[:512]
561         if ('<title>Access to this site is blocked</title>' in content and
562                 'Websense' in first_block):
563             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
564             blocked_iframe = self._html_search_regex(
565                 r'<iframe src="([^"]+)"', content,
566                 'Websense information URL', default=None)
567             if blocked_iframe:
568                 msg += ' Visit %s for more details' % blocked_iframe
569             raise ExtractorError(msg, expected=True)
570         if '<title>The URL you requested has been blocked</title>' in first_block:
571             msg = (
572                 'Access to this webpage has been blocked by Indian censorship. '
573                 'Use a VPN or proxy server (with --proxy) to route around it.')
574             block_msg = self._html_search_regex(
575                 r'</h1><p>(.*?)</p>',
576                 content, 'block message', default=None)
577             if block_msg:
578                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
579             raise ExtractorError(msg, expected=True)
580         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
581                 'blocklist.rkn.gov.ru' in content):
582             raise ExtractorError(
583                 'Access to this webpage has been blocked by decision of the Russian government. '
584                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
585                 expected=True)
586
587     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
588         content_type = urlh.headers.get('Content-Type', '')
589         webpage_bytes = urlh.read()
590         if prefix is not None:
591             webpage_bytes = prefix + webpage_bytes
592         if not encoding:
593             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
594         if self._downloader.params.get('dump_intermediate_pages', False):
595             try:
596                 url = url_or_request.get_full_url()
597             except AttributeError:
598                 url = url_or_request
599             self.to_screen('Dumping request to ' + url)
600             dump = base64.b64encode(webpage_bytes).decode('ascii')
601             self._downloader.to_screen(dump)
602         if self._downloader.params.get('write_pages', False):
603             try:
604                 url = url_or_request.get_full_url()
605             except AttributeError:
606                 url = url_or_request
607             basen = '%s_%s' % (video_id, url)
608             if len(basen) > 240:
609                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
610                 basen = basen[:240 - len(h)] + h
611             raw_filename = basen + '.dump'
612             filename = sanitize_filename(raw_filename, restricted=True)
613             self.to_screen('Saving request to ' + filename)
614             # Working around MAX_PATH limitation on Windows (see
615             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
616             if compat_os_name == 'nt':
617                 absfilepath = os.path.abspath(filename)
618                 if len(absfilepath) > 259:
619                     filename = '\\\\?\\' + absfilepath
620             with open(filename, 'wb') as outf:
621                 outf.write(webpage_bytes)
622
623         try:
624             content = webpage_bytes.decode(encoding, 'replace')
625         except LookupError:
626             content = webpage_bytes.decode('utf-8', 'replace')
627
628         self.__check_blocked(content)
629
630         return content
631
632     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
633         """ Returns the data of the page as a string """
634         success = False
635         try_count = 0
636         while success is False:
637             try:
638                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
639                 success = True
640             except compat_http_client.IncompleteRead as e:
641                 try_count += 1
642                 if try_count >= tries:
643                     raise e
644                 self._sleep(timeout, video_id)
645         if res is False:
646             return res
647         else:
648             content, _ = res
649             return content
650
651     def _download_xml(self, url_or_request, video_id,
652                       note='Downloading XML', errnote='Unable to download XML',
653                       transform_source=None, fatal=True, encoding=None,
654                       data=None, headers={}, query={}):
655         """Return the xml as an xml.etree.ElementTree.Element"""
656         xml_string = self._download_webpage(
657             url_or_request, video_id, note, errnote, fatal=fatal,
658             encoding=encoding, data=data, headers=headers, query=query)
659         if xml_string is False:
660             return xml_string
661         return self._parse_xml(
662             xml_string, video_id, transform_source=transform_source,
663             fatal=fatal)
664
665     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
666         if transform_source:
667             xml_string = transform_source(xml_string)
668         try:
669             return compat_etree_fromstring(xml_string.encode('utf-8'))
670         except compat_xml_parse_error as ve:
671             errmsg = '%s: Failed to parse XML ' % video_id
672             if fatal:
673                 raise ExtractorError(errmsg, cause=ve)
674             else:
675                 self.report_warning(errmsg + str(ve))
676
677     def _download_json(self, url_or_request, video_id,
678                        note='Downloading JSON metadata',
679                        errnote='Unable to download JSON metadata',
680                        transform_source=None,
681                        fatal=True, encoding=None, data=None, headers={}, query={}):
682         json_string = self._download_webpage(
683             url_or_request, video_id, note, errnote, fatal=fatal,
684             encoding=encoding, data=data, headers=headers, query=query)
685         if (not fatal) and json_string is False:
686             return None
687         return self._parse_json(
688             json_string, video_id, transform_source=transform_source, fatal=fatal)
689
690     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
691         if transform_source:
692             json_string = transform_source(json_string)
693         try:
694             return json.loads(json_string)
695         except ValueError as ve:
696             errmsg = '%s: Failed to parse JSON ' % video_id
697             if fatal:
698                 raise ExtractorError(errmsg, cause=ve)
699             else:
700                 self.report_warning(errmsg + str(ve))
701
702     def report_warning(self, msg, video_id=None):
703         idstr = '' if video_id is None else '%s: ' % video_id
704         self._downloader.report_warning(
705             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
706
707     def to_screen(self, msg):
708         """Print msg to screen, prefixing it with '[ie_name]'"""
709         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
710
711     def report_extraction(self, id_or_name):
712         """Report information extraction."""
713         self.to_screen('%s: Extracting information' % id_or_name)
714
715     def report_download_webpage(self, video_id):
716         """Report webpage download."""
717         self.to_screen('%s: Downloading webpage' % video_id)
718
719     def report_age_confirmation(self):
720         """Report attempt to confirm age."""
721         self.to_screen('Confirming age')
722
723     def report_login(self):
724         """Report attempt to log in."""
725         self.to_screen('Logging in')
726
727     @staticmethod
728     def raise_login_required(msg='This video is only available for registered users'):
729         raise ExtractorError(
730             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
731             expected=True)
732
733     @staticmethod
734     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
735         raise GeoRestrictedError(msg, countries=countries)
736
737     # Methods for following #608
738     @staticmethod
739     def url_result(url, ie=None, video_id=None, video_title=None):
740         """Returns a URL that points to a page that should be processed"""
741         # TODO: ie should be the class used for getting the info
742         video_info = {'_type': 'url',
743                       'url': url,
744                       'ie_key': ie}
745         if video_id is not None:
746             video_info['id'] = video_id
747         if video_title is not None:
748             video_info['title'] = video_title
749         return video_info
750
751     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
752         urls = orderedSet(
753             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
754             for m in matches)
755         return self.playlist_result(
756             urls, playlist_id=playlist_id, playlist_title=playlist_title)
757
758     @staticmethod
759     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
760         """Returns a playlist"""
761         video_info = {'_type': 'playlist',
762                       'entries': entries}
763         if playlist_id:
764             video_info['id'] = playlist_id
765         if playlist_title:
766             video_info['title'] = playlist_title
767         if playlist_description:
768             video_info['description'] = playlist_description
769         return video_info
770
771     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
772         """
773         Perform a regex search on the given string, using a single or a list of
774         patterns returning the first matching group.
775         In case of failure return a default value or raise a WARNING or a
776         RegexNotFoundError, depending on fatal, specifying the field name.
777         """
778         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
779             mobj = re.search(pattern, string, flags)
780         else:
781             for p in pattern:
782                 mobj = re.search(p, string, flags)
783                 if mobj:
784                     break
785
786         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
787             _name = '\033[0;34m%s\033[0m' % name
788         else:
789             _name = name
790
791         if mobj:
792             if group is None:
793                 # return the first matching group
794                 return next(g for g in mobj.groups() if g is not None)
795             else:
796                 return mobj.group(group)
797         elif default is not NO_DEFAULT:
798             return default
799         elif fatal:
800             raise RegexNotFoundError('Unable to extract %s' % _name)
801         else:
802             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
803             return None
804
805     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
806         """
807         Like _search_regex, but strips HTML tags and unescapes entities.
808         """
809         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
810         if res:
811             return clean_html(res).strip()
812         else:
813             return res
814
815     def _get_netrc_login_info(self, netrc_machine=None):
816         username = None
817         password = None
818         netrc_machine = netrc_machine or self._NETRC_MACHINE
819
820         if self._downloader.params.get('usenetrc', False):
821             try:
822                 info = netrc.netrc().authenticators(netrc_machine)
823                 if info is not None:
824                     username = info[0]
825                     password = info[2]
826                 else:
827                     raise netrc.NetrcParseError(
828                         'No authenticators for %s' % netrc_machine)
829             except (IOError, netrc.NetrcParseError) as err:
830                 self._downloader.report_warning(
831                     'parsing .netrc: %s' % error_to_compat_str(err))
832
833         return username, password
834
835     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
836         """
837         Get the login info as (username, password)
838         First look for the manually specified credentials using username_option
839         and password_option as keys in params dictionary. If no such credentials
840         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
841         value.
842         If there's no info available, return (None, None)
843         """
844         if self._downloader is None:
845             return (None, None)
846
847         downloader_params = self._downloader.params
848
849         # Attempt to use provided username and password or .netrc data
850         if downloader_params.get(username_option) is not None:
851             username = downloader_params[username_option]
852             password = downloader_params[password_option]
853         else:
854             username, password = self._get_netrc_login_info(netrc_machine)
855
856         return username, password
857
858     def _get_tfa_info(self, note='two-factor verification code'):
859         """
860         Get the two-factor authentication info
861         TODO - asking the user will be required for sms/phone verify
862         currently just uses the command line option
863         If there's no info available, return None
864         """
865         if self._downloader is None:
866             return None
867         downloader_params = self._downloader.params
868
869         if downloader_params.get('twofactor') is not None:
870             return downloader_params['twofactor']
871
872         return compat_getpass('Type %s and press [Return]: ' % note)
873
874     # Helper functions for extracting OpenGraph info
875     @staticmethod
876     def _og_regexes(prop):
877         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
878         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
879                        % {'prop': re.escape(prop)})
880         template = r'<meta[^>]+?%s[^>]+?%s'
881         return [
882             template % (property_re, content_re),
883             template % (content_re, property_re),
884         ]
885
886     @staticmethod
887     def _meta_regex(prop):
888         return r'''(?isx)<meta
889                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
890                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
891
892     def _og_search_property(self, prop, html, name=None, **kargs):
893         if not isinstance(prop, (list, tuple)):
894             prop = [prop]
895         if name is None:
896             name = 'OpenGraph %s' % prop[0]
897         og_regexes = []
898         for p in prop:
899             og_regexes.extend(self._og_regexes(p))
900         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
901         if escaped is None:
902             return None
903         return unescapeHTML(escaped)
904
905     def _og_search_thumbnail(self, html, **kargs):
906         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
907
908     def _og_search_description(self, html, **kargs):
909         return self._og_search_property('description', html, fatal=False, **kargs)
910
911     def _og_search_title(self, html, **kargs):
912         return self._og_search_property('title', html, **kargs)
913
914     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
915         regexes = self._og_regexes('video') + self._og_regexes('video:url')
916         if secure:
917             regexes = self._og_regexes('video:secure_url') + regexes
918         return self._html_search_regex(regexes, html, name, **kargs)
919
920     def _og_search_url(self, html, **kargs):
921         return self._og_search_property('url', html, **kargs)
922
923     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
924         if not isinstance(name, (list, tuple)):
925             name = [name]
926         if display_name is None:
927             display_name = name[0]
928         return self._html_search_regex(
929             [self._meta_regex(n) for n in name],
930             html, display_name, fatal=fatal, group='content', **kwargs)
931
932     def _dc_search_uploader(self, html):
933         return self._html_search_meta('dc.creator', html, 'uploader')
934
935     def _rta_search(self, html):
936         # See http://www.rtalabel.org/index.php?content=howtofaq#single
937         if re.search(r'(?ix)<meta\s+name="rating"\s+'
938                      r'     content="RTA-5042-1996-1400-1577-RTA"',
939                      html):
940             return 18
941         return 0
942
943     def _media_rating_search(self, html):
944         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
945         rating = self._html_search_meta('rating', html)
946
947         if not rating:
948             return None
949
950         RATING_TABLE = {
951             'safe for kids': 0,
952             'general': 8,
953             '14 years': 14,
954             'mature': 17,
955             'restricted': 19,
956         }
957         return RATING_TABLE.get(rating.lower())
958
959     def _family_friendly_search(self, html):
960         # See http://schema.org/VideoObject
961         family_friendly = self._html_search_meta(
962             'isFamilyFriendly', html, default=None)
963
964         if not family_friendly:
965             return None
966
967         RATING_TABLE = {
968             '1': 0,
969             'true': 0,
970             '0': 18,
971             'false': 18,
972         }
973         return RATING_TABLE.get(family_friendly.lower())
974
975     def _twitter_search_player(self, html):
976         return self._html_search_meta('twitter:player', html,
977                                       'twitter card player')
978
979     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
980         json_ld = self._search_regex(
981             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
982             html, 'JSON-LD', group='json_ld', **kwargs)
983         default = kwargs.get('default', NO_DEFAULT)
984         if not json_ld:
985             return default if default is not NO_DEFAULT else {}
986         # JSON-LD may be malformed and thus `fatal` should be respected.
987         # At the same time `default` may be passed that assumes `fatal=False`
988         # for _search_regex. Let's simulate the same behavior here as well.
989         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
990         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
991
992     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
993         if isinstance(json_ld, compat_str):
994             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
995         if not json_ld:
996             return {}
997         info = {}
998         if not isinstance(json_ld, (list, tuple, dict)):
999             return info
1000         if isinstance(json_ld, dict):
1001             json_ld = [json_ld]
1002
1003         def extract_video_object(e):
1004             assert e['@type'] == 'VideoObject'
1005             info.update({
1006                 'url': e.get('contentUrl'),
1007                 'title': unescapeHTML(e.get('name')),
1008                 'description': unescapeHTML(e.get('description')),
1009                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1010                 'duration': parse_duration(e.get('duration')),
1011                 'timestamp': unified_timestamp(e.get('uploadDate')),
1012                 'filesize': float_or_none(e.get('contentSize')),
1013                 'tbr': int_or_none(e.get('bitrate')),
1014                 'width': int_or_none(e.get('width')),
1015                 'height': int_or_none(e.get('height')),
1016                 'view_count': int_or_none(e.get('interactionCount')),
1017             })
1018
1019         for e in json_ld:
1020             if e.get('@context') == 'http://schema.org':
1021                 item_type = e.get('@type')
1022                 if expected_type is not None and expected_type != item_type:
1023                     return info
1024                 if item_type in ('TVEpisode', 'Episode'):
1025                     info.update({
1026                         'episode': unescapeHTML(e.get('name')),
1027                         'episode_number': int_or_none(e.get('episodeNumber')),
1028                         'description': unescapeHTML(e.get('description')),
1029                     })
1030                     part_of_season = e.get('partOfSeason')
1031                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1032                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1033                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1034                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1035                         info['series'] = unescapeHTML(part_of_series.get('name'))
1036                 elif item_type == 'Article':
1037                     info.update({
1038                         'timestamp': parse_iso8601(e.get('datePublished')),
1039                         'title': unescapeHTML(e.get('headline')),
1040                         'description': unescapeHTML(e.get('articleBody')),
1041                     })
1042                 elif item_type == 'VideoObject':
1043                     extract_video_object(e)
1044                     continue
1045                 video = e.get('video')
1046                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1047                     extract_video_object(video)
1048                 break
1049         return dict((k, v) for k, v in info.items() if v is not None)
1050
1051     @staticmethod
1052     def _hidden_inputs(html):
1053         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1054         hidden_inputs = {}
1055         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1056             attrs = extract_attributes(input)
1057             if not input:
1058                 continue
1059             if attrs.get('type') not in ('hidden', 'submit'):
1060                 continue
1061             name = attrs.get('name') or attrs.get('id')
1062             value = attrs.get('value')
1063             if name and value is not None:
1064                 hidden_inputs[name] = value
1065         return hidden_inputs
1066
1067     def _form_hidden_inputs(self, form_id, html):
1068         form = self._search_regex(
1069             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1070             html, '%s form' % form_id, group='form')
1071         return self._hidden_inputs(form)
1072
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort formats in place, in ascending order of their sort key.

        If field_preference is a list or tuple of field names, the key of a
        format is the tuple of its values for these fields (a missing value
        counts as '' for format_id and as -1 for any other field).
        Otherwise the key is built from the format's preference, language
        preference, quality, bitrates, size, resolution, protocol, extension
        and further fields, compared in that order.

        Formats may be modified on the way: a missing 'tbr' is filled in
        with abr + vbr when both are known and a missing 'ext' is determined
        from the URL.

        Raises ExtractorError if formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # An explicit field list overrides the default ordering entirely
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            # Without an explicit preference start from 0, slightly
            # penalizing f4f/f4m
            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # http(s) ranks above the other protocols, rtsp ranks lowest
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            # ORDER lists the extensions from least to most preferred; the
            # position of the format's extension becomes its extension
            # preference (-1 if the extension is not listed)
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Unknown values are replaced with -1 ('' for format_id) so
            # that the tuples stay comparable
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1148
1149     def _check_formats(self, formats, video_id):
1150         if formats:
1151             formats[:] = filter(
1152                 lambda f: self._is_valid_url(
1153                     f['url'], video_id,
1154                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1155                 formats)
1156
1157     @staticmethod
1158     def _remove_duplicate_formats(formats):
1159         format_urls = set()
1160         unique_formats = []
1161         for f in formats:
1162             if f['url'] not in format_urls:
1163                 format_urls.add(f['url'])
1164                 unique_formats.append(f)
1165         formats[:] = unique_formats
1166
1167     def _is_valid_url(self, url, video_id, item='video', headers={}):
1168         url = self._proto_relative_url(url, scheme='http:')
1169         # For now assume non HTTP(S) URLs always valid
1170         if not (url.startswith('http://') or url.startswith('https://')):
1171             return True
1172         try:
1173             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1174             return True
1175         except ExtractorError as e:
1176             if isinstance(e.cause, compat_urllib_error.URLError):
1177                 self.to_screen(
1178                     '%s: %s URL is invalid, skipping' % (video_id, item))
1179                 return False
1180             raise
1181
1182     def http_scheme(self):
1183         """ Either "http:" or "https:", depending on the user's preferences """
1184         return (
1185             'http:'
1186             if self._downloader.params.get('prefer_insecure', False)
1187             else 'https:')
1188
1189     def _proto_relative_url(self, url, scheme=None):
1190         if url is None:
1191             return url
1192         if url.startswith('//'):
1193             if scheme is None:
1194                 scheme = self.http_scheme()
1195             return scheme + url
1196         else:
1197             return url
1198
1199     def _sleep(self, timeout, video_id, msg_template=None):
1200         if msg_template is None:
1201             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1202         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1203         self.to_screen(msg)
1204         time.sleep(timeout)
1205
1206     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1207                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1208                              fatal=True, m3u8_id=None):
1209         manifest = self._download_xml(
1210             manifest_url, video_id, 'Downloading f4m manifest',
1211             'Unable to download f4m manifest',
1212             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1213             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1214             transform_source=transform_source,
1215             fatal=fatal)
1216
1217         if manifest is False:
1218             return []
1219
1220         return self._parse_f4m_formats(
1221             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1222             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1223
1224     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1225                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1226                            fatal=True, m3u8_id=None):
1227         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1228         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1229         if akamai_pv is not None and ';' in akamai_pv.text:
1230             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1231             if playerVerificationChallenge.strip() != '':
1232                 return []
1233
1234         formats = []
1235         manifest_version = '1.0'
1236         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1237         if not media_nodes:
1238             manifest_version = '2.0'
1239             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1240         # Remove unsupported DRM protected media from final formats
1241         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1242         media_nodes = remove_encrypted_media(media_nodes)
1243         if not media_nodes:
1244             return formats
1245
1246         manifest_base_url = get_base_url(manifest)
1247
1248         bootstrap_info = xpath_element(
1249             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1250             'bootstrap info', default=None)
1251
1252         vcodec = None
1253         mime_type = xpath_text(
1254             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1255             'base URL', default=None)
1256         if mime_type and mime_type.startswith('audio/'):
1257             vcodec = 'none'
1258
1259         for i, media_el in enumerate(media_nodes):
1260             tbr = int_or_none(media_el.attrib.get('bitrate'))
1261             width = int_or_none(media_el.attrib.get('width'))
1262             height = int_or_none(media_el.attrib.get('height'))
1263             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1264             # If <bootstrapInfo> is present, the specified f4m is a
1265             # stream-level manifest, and only set-level manifests may refer to
1266             # external resources.  See section 11.4 and section 4 of F4M spec
1267             if bootstrap_info is None:
1268                 media_url = None
1269                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1270                 if manifest_version == '2.0':
1271                     media_url = media_el.attrib.get('href')
1272                 if media_url is None:
1273                     media_url = media_el.attrib.get('url')
1274                 if not media_url:
1275                     continue
1276                 manifest_url = (
1277                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1278                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1279                 # If media_url is itself a f4m manifest do the recursive extraction
1280                 # since bitrates in parent manifest (this one) and media_url manifest
1281                 # may differ leading to inability to resolve the format by requested
1282                 # bitrate in f4m downloader
1283                 ext = determine_ext(manifest_url)
1284                 if ext == 'f4m':
1285                     f4m_formats = self._extract_f4m_formats(
1286                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1287                         transform_source=transform_source, fatal=fatal)
1288                     # Sometimes stream-level manifest contains single media entry that
1289                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1290                     # At the same time parent's media entry in set-level manifest may
1291                     # contain it. We will copy it from parent in such cases.
1292                     if len(f4m_formats) == 1:
1293                         f = f4m_formats[0]
1294                         f.update({
1295                             'tbr': f.get('tbr') or tbr,
1296                             'width': f.get('width') or width,
1297                             'height': f.get('height') or height,
1298                             'format_id': f.get('format_id') if not tbr else format_id,
1299                             'vcodec': vcodec,
1300                         })
1301                     formats.extend(f4m_formats)
1302                     continue
1303                 elif ext == 'm3u8':
1304                     formats.extend(self._extract_m3u8_formats(
1305                         manifest_url, video_id, 'mp4', preference=preference,
1306                         m3u8_id=m3u8_id, fatal=fatal))
1307                     continue
1308             formats.append({
1309                 'format_id': format_id,
1310                 'url': manifest_url,
1311                 'manifest_url': manifest_url,
1312                 'ext': 'flv' if bootstrap_info is not None else None,
1313                 'tbr': tbr,
1314                 'width': width,
1315                 'height': height,
1316                 'vcodec': vcodec,
1317                 'preference': preference,
1318             })
1319         return formats
1320
1321     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1322         return {
1323             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1324             'url': m3u8_url,
1325             'ext': ext,
1326             'protocol': 'm3u8',
1327             'preference': preference - 100 if preference else -100,
1328             'resolution': 'multiple',
1329             'format_note': 'Quality selection URL',
1330         }
1331
1332     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1333                               entry_protocol='m3u8', preference=None,
1334                               m3u8_id=None, note=None, errnote=None,
1335                               fatal=True, live=False):
1336         res = self._download_webpage_handle(
1337             m3u8_url, video_id,
1338             note=note or 'Downloading m3u8 information',
1339             errnote=errnote or 'Failed to download m3u8 information',
1340             fatal=fatal)
1341
1342         if res is False:
1343             return []
1344
1345         m3u8_doc, urlh = res
1346         m3u8_url = urlh.geturl()
1347
1348         return self._parse_m3u8_formats(
1349             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1350             preference=preference, m3u8_id=m3u8_id, live=live)
1351
1352     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1353                             entry_protocol='m3u8', preference=None,
1354                             m3u8_id=None, live=False):
1355         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1356             return []
1357
1358         formats = []
1359
1360         format_url = lambda u: (
1361             u
1362             if re.match(r'^https?://', u)
1363             else compat_urlparse.urljoin(m3u8_url, u))
1364
1365         # References:
1366         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1367         # 2. https://github.com/rg3/youtube-dl/issues/12211
1368
1369         # We should try extracting formats only from master playlists [1, 4.3.4],
1370         # i.e. playlists that describe available qualities. On the other hand
1371         # media playlists [1, 4.3.3] should be returned as is since they contain
1372         # just the media without qualities renditions.
1373         # Fortunately, master playlist can be easily distinguished from media
1374         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1375         # master playlist tags MUST NOT appear in a media playist and vice versa.
1376         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1377         # media playlist and MUST NOT appear in master playlist thus we can
1378         # clearly detect media playlist with this criterion.
1379
1380         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1381             return [{
1382                 'url': m3u8_url,
1383                 'format_id': m3u8_id,
1384                 'ext': ext,
1385                 'protocol': entry_protocol,
1386                 'preference': preference,
1387             }]
1388
1389         groups = {}
1390         last_stream_inf = {}
1391
1392         def extract_media(x_media_line):
1393             media = parse_m3u8_attributes(x_media_line)
1394             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1395             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1396             if not (media_type and group_id and name):
1397                 return
1398             groups.setdefault(group_id, []).append(media)
1399             if media_type not in ('VIDEO', 'AUDIO'):
1400                 return
1401             media_url = media.get('URI')
1402             if media_url:
1403                 format_id = []
1404                 for v in (m3u8_id, group_id, name):
1405                     if v:
1406                         format_id.append(v)
1407                 f = {
1408                     'format_id': '-'.join(format_id),
1409                     'url': format_url(media_url),
1410                     'manifest_url': m3u8_url,
1411                     'language': media.get('LANGUAGE'),
1412                     'ext': ext,
1413                     'protocol': entry_protocol,
1414                     'preference': preference,
1415                 }
1416                 if media_type == 'AUDIO':
1417                     f['vcodec'] = 'none'
1418                 formats.append(f)
1419
1420         def build_stream_name():
1421             # Despite specification does not mention NAME attribute for
1422             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1423             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1424             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1425             stream_name = last_stream_inf.get('NAME')
1426             if stream_name:
1427                 return stream_name
1428             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1429             # from corresponding rendition group
1430             stream_group_id = last_stream_inf.get('VIDEO')
1431             if not stream_group_id:
1432                 return
1433             stream_group = groups.get(stream_group_id)
1434             if not stream_group:
1435                 return stream_group_id
1436             rendition = stream_group[0]
1437             return rendition.get('NAME') or stream_group_id
1438
1439         for line in m3u8_doc.splitlines():
1440             if line.startswith('#EXT-X-STREAM-INF:'):
1441                 last_stream_inf = parse_m3u8_attributes(line)
1442             elif line.startswith('#EXT-X-MEDIA:'):
1443                 extract_media(line)
1444             elif line.startswith('#') or not line.strip():
1445                 continue
1446             else:
1447                 tbr = float_or_none(
1448                     last_stream_inf.get('AVERAGE-BANDWIDTH') or
1449                     last_stream_inf.get('BANDWIDTH'), scale=1000)
1450                 format_id = []
1451                 if m3u8_id:
1452                     format_id.append(m3u8_id)
1453                 stream_name = build_stream_name()
1454                 # Bandwidth of live streams may differ over time thus making
1455                 # format_id unpredictable. So it's better to keep provided
1456                 # format_id intact.
1457                 if not live:
1458                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1459                 manifest_url = format_url(line.strip())
1460                 f = {
1461                     'format_id': '-'.join(format_id),
1462                     'url': manifest_url,
1463                     'manifest_url': m3u8_url,
1464                     'tbr': tbr,
1465                     'ext': ext,
1466                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1467                     'protocol': entry_protocol,
1468                     'preference': preference,
1469                 }
1470                 resolution = last_stream_inf.get('RESOLUTION')
1471                 if resolution:
1472                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1473                     if mobj:
1474                         f['width'] = int(mobj.group('width'))
1475                         f['height'] = int(mobj.group('height'))
1476                 # Unified Streaming Platform
1477                 mobj = re.search(
1478                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1479                 if mobj:
1480                     abr, vbr = mobj.groups()
1481                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1482                     f.update({
1483                         'vbr': vbr,
1484                         'abr': abr,
1485                     })
1486                 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1487                 f.update(codecs)
1488                 audio_group_id = last_stream_inf.get('AUDIO')
1489                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1490                 # references a rendition group MUST have a CODECS attribute.
1491                 # However, this is not always respected, for example, [2]
1492                 # contains EXT-X-STREAM-INF tag which references AUDIO
1493                 # rendition group but does not have CODECS and despite
1494                 # referencing audio group an audio group, it represents
1495                 # a complete (with audio and video) format. So, for such cases
1496                 # we will ignore references to rendition groups and treat them
1497                 # as complete formats.
1498                 if audio_group_id and codecs and f.get('vcodec') != 'none':
1499                     audio_group = groups.get(audio_group_id)
1500                     if audio_group and audio_group[0].get('URI'):
1501                         # TODO: update acodec for audio only formats with
1502                         # the same GROUP-ID
1503                         f['acodec'] = 'none'
1504                 formats.append(f)
1505                 last_stream_inf = {}
1506         return formats
1507
1508     @staticmethod
1509     def _xpath_ns(path, namespace=None):
1510         if not namespace:
1511             return path
1512         out = []
1513         for c in path.split('/'):
1514             if not c or c == '.':
1515                 out.append(c)
1516             else:
1517                 out.append('{%s}%s' % (namespace, c))
1518         return '/'.join(out)
1519
1520     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1521         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1522
1523         if smil is False:
1524             assert not fatal
1525             return []
1526
1527         namespace = self._parse_smil_namespace(smil)
1528
1529         return self._parse_smil_formats(
1530             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1531
1532     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1533         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1534         if smil is False:
1535             return {}
1536         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1537
1538     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1539         return self._download_xml(
1540             smil_url, video_id, 'Downloading SMIL file',
1541             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1542
1543     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1544         namespace = self._parse_smil_namespace(smil)
1545
1546         formats = self._parse_smil_formats(
1547             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1548         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1549
1550         video_id = os.path.splitext(url_basename(smil_url))[0]
1551         title = None
1552         description = None
1553         upload_date = None
1554         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1555             name = meta.attrib.get('name')
1556             content = meta.attrib.get('content')
1557             if not name or not content:
1558                 continue
1559             if not title and name == 'title':
1560                 title = content
1561             elif not description and name in ('description', 'abstract'):
1562                 description = content
1563             elif not upload_date and name == 'date':
1564                 upload_date = unified_strdate(content)
1565
1566         thumbnails = [{
1567             'id': image.get('type'),
1568             'url': image.get('src'),
1569             'width': int_or_none(image.get('width')),
1570             'height': int_or_none(image.get('height')),
1571         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1572
1573         return {
1574             'id': video_id,
1575             'title': title or video_id,
1576             'description': description,
1577             'upload_date': upload_date,
1578             'thumbnails': thumbnails,
1579             'formats': formats,
1580             'subtitles': subtitles,
1581         }
1582
1583     def _parse_smil_namespace(self, smil):
1584         return self._search_regex(
1585             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1586
1587     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1588         base = smil_url
1589         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1590             b = meta.get('base') or meta.get('httpBase')
1591             if b:
1592                 base = b
1593                 break
1594
1595         formats = []
1596         rtmp_count = 0
1597         http_count = 0
1598         m3u8_count = 0
1599
1600         srcs = []
1601         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1602         for medium in media:
1603             src = medium.get('src')
1604             if not src or src in srcs:
1605                 continue
1606             srcs.append(src)
1607
1608             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1609             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1610             width = int_or_none(medium.get('width'))
1611             height = int_or_none(medium.get('height'))
1612             proto = medium.get('proto')
1613             ext = medium.get('ext')
1614             src_ext = determine_ext(src)
1615             streamer = medium.get('streamer') or base
1616
1617             if proto == 'rtmp' or streamer.startswith('rtmp'):
1618                 rtmp_count += 1
1619                 formats.append({
1620                     'url': streamer,
1621                     'play_path': src,
1622                     'ext': 'flv',
1623                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1624                     'tbr': bitrate,
1625                     'filesize': filesize,
1626                     'width': width,
1627                     'height': height,
1628                 })
1629                 if transform_rtmp_url:
1630                     streamer, src = transform_rtmp_url(streamer, src)
1631                     formats[-1].update({
1632                         'url': streamer,
1633                         'play_path': src,
1634                     })
1635                 continue
1636
1637             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1638             src_url = src_url.strip()
1639
1640             if proto == 'm3u8' or src_ext == 'm3u8':
1641                 m3u8_formats = self._extract_m3u8_formats(
1642                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1643                 if len(m3u8_formats) == 1:
1644                     m3u8_count += 1
1645                     m3u8_formats[0].update({
1646                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1647                         'tbr': bitrate,
1648                         'width': width,
1649                         'height': height,
1650                     })
1651                 formats.extend(m3u8_formats)
1652                 continue
1653
1654             if src_ext == 'f4m':
1655                 f4m_url = src_url
1656                 if not f4m_params:
1657                     f4m_params = {
1658                         'hdcore': '3.2.0',
1659                         'plugin': 'flowplayer-3.2.0.1',
1660                     }
1661                 f4m_url += '&' if '?' in f4m_url else '?'
1662                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1663                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1664                 continue
1665
1666             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1667                 http_count += 1
1668                 formats.append({
1669                     'url': src_url,
1670                     'ext': ext or src_ext or 'flv',
1671                     'format_id': 'http-%d' % (bitrate or http_count),
1672                     'tbr': bitrate,
1673                     'filesize': filesize,
1674                     'width': width,
1675                     'height': height,
1676                 })
1677                 continue
1678
1679         return formats
1680
1681     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1682         urls = []
1683         subtitles = {}
1684         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1685             src = textstream.get('src')
1686             if not src or src in urls:
1687                 continue
1688             urls.append(src)
1689             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1690             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1691             subtitles.setdefault(lang, []).append({
1692                 'url': src,
1693                 'ext': ext,
1694             })
1695         return subtitles
1696
1697     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1698         xspf = self._download_xml(
1699             playlist_url, playlist_id, 'Downloading xpsf playlist',
1700             'Unable to download xspf manifest', fatal=fatal)
1701         if xspf is False:
1702             return []
1703         return self._parse_xspf(xspf, playlist_id)
1704
1705     def _parse_xspf(self, playlist, playlist_id):
1706         NS_MAP = {
1707             'xspf': 'http://xspf.org/ns/0/',
1708             's1': 'http://static.streamone.nl/player/ns/0',
1709         }
1710
1711         entries = []
1712         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1713             title = xpath_text(
1714                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1715             description = xpath_text(
1716                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1717             thumbnail = xpath_text(
1718                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1719             duration = float_or_none(
1720                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1721
1722             formats = [{
1723                 'url': location.text,
1724                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1725                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1726                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1727             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1728             self._sort_formats(formats)
1729
1730             entries.append({
1731                 'id': playlist_id,
1732                 'title': title,
1733                 'description': description,
1734                 'thumbnail': thumbnail,
1735                 'duration': duration,
1736                 'formats': formats,
1737             })
1738         return entries
1739
1740     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1741         res = self._download_webpage_handle(
1742             mpd_url, video_id,
1743             note=note or 'Downloading MPD manifest',
1744             errnote=errnote or 'Failed to download MPD manifest',
1745             fatal=fatal)
1746         if res is False:
1747             return []
1748         mpd, urlh = res
1749         mpd_base_url = base_url(urlh.geturl())
1750
1751         return self._parse_mpd_formats(
1752             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1753             formats_dict=formats_dict, mpd_url=mpd_url)
1754
1755     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1756         """
1757         Parse formats from MPD manifest.
1758         References:
1759          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1760             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1761          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1762         """
1763         if mpd_doc.get('type') == 'dynamic':
1764             return []
1765
1766         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1767
1768         def _add_ns(path):
1769             return self._xpath_ns(path, namespace)
1770
1771         def is_drm_protected(element):
1772             return element.find(_add_ns('ContentProtection')) is not None
1773
1774         def extract_multisegment_info(element, ms_parent_info):
1775             ms_info = ms_parent_info.copy()
1776
1777             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1778             # common attributes and elements.  We will only extract relevant
1779             # for us.
1780             def extract_common(source):
1781                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1782                 if segment_timeline is not None:
1783                     s_e = segment_timeline.findall(_add_ns('S'))
1784                     if s_e:
1785                         ms_info['total_number'] = 0
1786                         ms_info['s'] = []
1787                         for s in s_e:
1788                             r = int(s.get('r', 0))
1789                             ms_info['total_number'] += 1 + r
1790                             ms_info['s'].append({
1791                                 't': int(s.get('t', 0)),
1792                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1793                                 'd': int(s.attrib['d']),
1794                                 'r': r,
1795                             })
1796                 start_number = source.get('startNumber')
1797                 if start_number:
1798                     ms_info['start_number'] = int(start_number)
1799                 timescale = source.get('timescale')
1800                 if timescale:
1801                     ms_info['timescale'] = int(timescale)
1802                 segment_duration = source.get('duration')
1803                 if segment_duration:
1804                     ms_info['segment_duration'] = float(segment_duration)
1805
1806             def extract_Initialization(source):
1807                 initialization = source.find(_add_ns('Initialization'))
1808                 if initialization is not None:
1809                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1810
1811             segment_list = element.find(_add_ns('SegmentList'))
1812             if segment_list is not None:
1813                 extract_common(segment_list)
1814                 extract_Initialization(segment_list)
1815                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1816                 if segment_urls_e:
1817                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1818             else:
1819                 segment_template = element.find(_add_ns('SegmentTemplate'))
1820                 if segment_template is not None:
1821                     extract_common(segment_template)
1822                     media = segment_template.get('media')
1823                     if media:
1824                         ms_info['media'] = media
1825                     initialization = segment_template.get('initialization')
1826                     if initialization:
1827                         ms_info['initialization'] = initialization
1828                     else:
1829                         extract_Initialization(segment_template)
1830             return ms_info
1831
1832         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1833         formats = []
1834         for period in mpd_doc.findall(_add_ns('Period')):
1835             period_duration = parse_duration(period.get('duration')) or mpd_duration
1836             period_ms_info = extract_multisegment_info(period, {
1837                 'start_number': 1,
1838                 'timescale': 1,
1839             })
1840             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1841                 if is_drm_protected(adaptation_set):
1842                     continue
1843                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1844                 for representation in adaptation_set.findall(_add_ns('Representation')):
1845                     if is_drm_protected(representation):
1846                         continue
1847                     representation_attrib = adaptation_set.attrib.copy()
1848                     representation_attrib.update(representation.attrib)
1849                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1850                     mime_type = representation_attrib['mimeType']
1851                     content_type = mime_type.split('/')[0]
1852                     if content_type == 'text':
1853                         # TODO implement WebVTT downloading
1854                         pass
1855                     elif content_type in ('video', 'audio'):
1856                         base_url = ''
1857                         for element in (representation, adaptation_set, period, mpd_doc):
1858                             base_url_e = element.find(_add_ns('BaseURL'))
1859                             if base_url_e is not None:
1860                                 base_url = base_url_e.text + base_url
1861                                 if re.match(r'^https?://', base_url):
1862                                     break
1863                         if mpd_base_url and not re.match(r'^https?://', base_url):
1864                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1865                                 mpd_base_url += '/'
1866                             base_url = mpd_base_url + base_url
1867                         representation_id = representation_attrib.get('id')
1868                         lang = representation_attrib.get('lang')
1869                         url_el = representation.find(_add_ns('BaseURL'))
1870                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1871                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1872                         f = {
1873                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1874                             'url': base_url,
1875                             'manifest_url': mpd_url,
1876                             'ext': mimetype2ext(mime_type),
1877                             'width': int_or_none(representation_attrib.get('width')),
1878                             'height': int_or_none(representation_attrib.get('height')),
1879                             'tbr': float_or_none(bandwidth, 1000),
1880                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1881                             'fps': int_or_none(representation_attrib.get('frameRate')),
1882                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1883                             'format_note': 'DASH %s' % content_type,
1884                             'filesize': filesize,
1885                         }
1886                         f.update(parse_codecs(representation_attrib.get('codecs')))
1887                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1888
1889                         def prepare_template(template_name, identifiers):
1890                             t = representation_ms_info[template_name]
1891                             t = t.replace('$RepresentationID$', representation_id)
1892                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1893                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1894                             t.replace('$$', '$')
1895                             return t
1896
1897                         # @initialization is a regular template like @media one
1898                         # so it should be handled just the same way (see
1899                         # https://github.com/rg3/youtube-dl/issues/11605)
1900                         if 'initialization' in representation_ms_info:
1901                             initialization_template = prepare_template(
1902                                 'initialization',
1903                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1904                                 # $Time$ shall not be included for @initialization thus
1905                                 # only $Bandwidth$ remains
1906                                 ('Bandwidth', ))
1907                             representation_ms_info['initialization_url'] = initialization_template % {
1908                                 'Bandwidth': bandwidth,
1909                             }
1910
1911                         def location_key(location):
1912                             return 'url' if re.match(r'^https?://', location) else 'path'
1913
1914                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1915
1916                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1917                             media_location_key = location_key(media_template)
1918
1919                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1920                             # can't be used at the same time
1921                             if '%(Number' in media_template and 's' not in representation_ms_info:
1922                                 segment_duration = None
1923                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1924                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1925                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1926                                 representation_ms_info['fragments'] = [{
1927                                     media_location_key: media_template % {
1928                                         'Number': segment_number,
1929                                         'Bandwidth': bandwidth,
1930                                     },
1931                                     'duration': segment_duration,
1932                                 } for segment_number in range(
1933                                     representation_ms_info['start_number'],
1934                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1935                             else:
1936                                 # $Number*$ or $Time$ in media template with S list available
1937                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1938                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1939                                 representation_ms_info['fragments'] = []
1940                                 segment_time = 0
1941                                 segment_d = None
1942                                 segment_number = representation_ms_info['start_number']
1943
1944                                 def add_segment_url():
1945                                     segment_url = media_template % {
1946                                         'Time': segment_time,
1947                                         'Bandwidth': bandwidth,
1948                                         'Number': segment_number,
1949                                     }
1950                                     representation_ms_info['fragments'].append({
1951                                         media_location_key: segment_url,
1952                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1953                                     })
1954
1955                                 for num, s in enumerate(representation_ms_info['s']):
1956                                     segment_time = s.get('t') or segment_time
1957                                     segment_d = s['d']
1958                                     add_segment_url()
1959                                     segment_number += 1
1960                                     for r in range(s.get('r', 0)):
1961                                         segment_time += segment_d
1962                                         add_segment_url()
1963                                         segment_number += 1
1964                                     segment_time += segment_d
1965                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1966                             # No media template
1967                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1968                             # or any YouTube dashsegments video
1969                             fragments = []
1970                             segment_index = 0
1971                             timescale = representation_ms_info['timescale']
1972                             for s in representation_ms_info['s']:
1973                                 duration = float_or_none(s['d'], timescale)
1974                                 for r in range(s.get('r', 0) + 1):
1975                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1976                                     fragments.append({
1977                                         location_key(segment_uri): segment_uri,
1978                                         'duration': duration,
1979                                     })
1980                                     segment_index += 1
1981                             representation_ms_info['fragments'] = fragments
1982                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1983                         # No fragments key is present in this case.
1984                         if 'fragments' in representation_ms_info:
1985                             f.update({
1986                                 'fragment_base_url': base_url,
1987                                 'fragments': [],
1988                                 'protocol': 'http_dash_segments',
1989                             })
1990                             if 'initialization_url' in representation_ms_info:
1991                                 initialization_url = representation_ms_info['initialization_url']
1992                                 if not f.get('url'):
1993                                     f['url'] = initialization_url
1994                                 f['fragments'].append({location_key(initialization_url): initialization_url})
1995                             f['fragments'].extend(representation_ms_info['fragments'])
1996                         try:
1997                             existing_format = next(
1998                                 fo for fo in formats
1999                                 if fo['format_id'] == representation_id)
2000                         except StopIteration:
2001                             full_info = formats_dict.get(representation_id, {}).copy()
2002                             full_info.update(f)
2003                             formats.append(full_info)
2004                         else:
2005                             existing_format.update(f)
2006                     else:
2007                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2008         return formats
2009
2010     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2011         res = self._download_webpage_handle(
2012             ism_url, video_id,
2013             note=note or 'Downloading ISM manifest',
2014             errnote=errnote or 'Failed to download ISM manifest',
2015             fatal=fatal)
2016         if res is False:
2017             return []
2018         ism, urlh = res
2019
2020         return self._parse_ism_formats(
2021             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2022
2023     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2024         """
2025         Parse formats from ISM manifest.
2026         References:
2027          1. [MS-SSTR]: Smooth Streaming Protocol,
2028             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2029         """
2030         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2031             return []
2032
2033         duration = int(ism_doc.attrib['Duration'])
2034         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2035
2036         formats = []
2037         for stream in ism_doc.findall('StreamIndex'):
2038             stream_type = stream.get('Type')
2039             if stream_type not in ('video', 'audio'):
2040                 continue
2041             url_pattern = stream.attrib['Url']
2042             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2043             stream_name = stream.get('Name')
2044             for track in stream.findall('QualityLevel'):
2045                 fourcc = track.get('FourCC')
2046                 # TODO: add support for WVC1 and WMAP
2047                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2048                     self.report_warning('%s is not a supported codec' % fourcc)
2049                     continue
2050                 tbr = int(track.attrib['Bitrate']) // 1000
2051                 # [1] does not mention Width and Height attributes. However,
2052                 # they're often present while MaxWidth and MaxHeight are
2053                 # missing, so should be used as fallbacks
2054                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2055                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2056                 sampling_rate = int_or_none(track.get('SamplingRate'))
2057
2058                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2059                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2060
2061                 fragments = []
2062                 fragment_ctx = {
2063                     'time': 0,
2064                 }
2065                 stream_fragments = stream.findall('c')
2066                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2067                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2068                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2069                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2070                     if not fragment_ctx['duration']:
2071                         try:
2072                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2073                         except IndexError:
2074                             next_fragment_time = duration
2075                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2076                     for _ in range(fragment_repeat):
2077                         fragments.append({
2078                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2079                             'duration': fragment_ctx['duration'] / stream_timescale,
2080                         })
2081                         fragment_ctx['time'] += fragment_ctx['duration']
2082
2083                 format_id = []
2084                 if ism_id:
2085                     format_id.append(ism_id)
2086                 if stream_name:
2087                     format_id.append(stream_name)
2088                 format_id.append(compat_str(tbr))
2089
2090                 formats.append({
2091                     'format_id': '-'.join(format_id),
2092                     'url': ism_url,
2093                     'manifest_url': ism_url,
2094                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2095                     'width': width,
2096                     'height': height,
2097                     'tbr': tbr,
2098                     'asr': sampling_rate,
2099                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2100                     'acodec': 'none' if stream_type == 'video' else fourcc,
2101                     'protocol': 'ism',
2102                     'fragments': fragments,
2103                     '_download_params': {
2104                         'duration': duration,
2105                         'timescale': stream_timescale,
2106                         'width': width or 0,
2107                         'height': height or 0,
2108                         'fourcc': fourcc,
2109                         'codec_private_data': track.get('CodecPrivateData'),
2110                         'sampling_rate': sampling_rate,
2111                         'channels': int_or_none(track.get('Channels', 2)),
2112                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2113                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2114                     },
2115                 })
2116         return formats
2117
2118     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2119         def absolute_url(video_url):
2120             return compat_urlparse.urljoin(base_url, video_url)
2121
2122         def parse_content_type(content_type):
2123             if not content_type:
2124                 return {}
2125             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2126             if ctr:
2127                 mimetype, codecs = ctr.groups()
2128                 f = parse_codecs(codecs)
2129                 f['ext'] = mimetype2ext(mimetype)
2130                 return f
2131             return {}
2132
2133         def _media_formats(src, cur_media_type, type_info={}):
2134             full_url = absolute_url(src)
2135             ext = type_info.get('ext') or determine_ext(full_url)
2136             if ext == 'm3u8':
2137                 is_plain_url = False
2138                 formats = self._extract_m3u8_formats(
2139                     full_url, video_id, ext='mp4',
2140                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2141                     preference=preference, fatal=False)
2142             elif ext == 'mpd':
2143                 is_plain_url = False
2144                 formats = self._extract_mpd_formats(
2145                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2146             else:
2147                 is_plain_url = True
2148                 formats = [{
2149                     'url': full_url,
2150                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2151                 }]
2152             return is_plain_url, formats
2153
2154         entries = []
2155         # amp-video and amp-audio are very similar to their HTML5 counterparts
2156         # so we wll include them right here (see
2157         # https://www.ampproject.org/docs/reference/components/amp-video)
2158         media_tags = [(media_tag, media_type, '')
2159                       for media_tag, media_type
2160                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2161         media_tags.extend(re.findall(
2162             # We only allow video|audio followed by a whitespace or '>'.
2163             # Allowing more characters may end up in significant slow down (see
2164             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2165             # http://www.porntrex.com/maps/videositemap.xml).
2166             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2167         for media_tag, media_type, media_content in media_tags:
2168             media_info = {
2169                 'formats': [],
2170                 'subtitles': {},
2171             }
2172             media_attributes = extract_attributes(media_tag)
2173             src = media_attributes.get('src')
2174             if src:
2175                 _, formats = _media_formats(src, media_type)
2176                 media_info['formats'].extend(formats)
2177             media_info['thumbnail'] = media_attributes.get('poster')
2178             if media_content:
2179                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2180                     source_attributes = extract_attributes(source_tag)
2181                     src = source_attributes.get('src')
2182                     if not src:
2183                         continue
2184                     f = parse_content_type(source_attributes.get('type'))
2185                     is_plain_url, formats = _media_formats(src, media_type, f)
2186                     if is_plain_url:
2187                         # res attribute is not standard but seen several times
2188                         # in the wild
2189                         f.update({
2190                             'height': int_or_none(source_attributes.get('res')),
2191                             'format_id': source_attributes.get('label'),
2192                         })
2193                         f.update(formats[0])
2194                         media_info['formats'].append(f)
2195                     else:
2196                         media_info['formats'].extend(formats)
2197                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2198                     track_attributes = extract_attributes(track_tag)
2199                     kind = track_attributes.get('kind')
2200                     if not kind or kind in ('subtitles', 'captions'):
2201                         src = track_attributes.get('src')
2202                         if not src:
2203                             continue
2204                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2205                         media_info['subtitles'].setdefault(lang, []).append({
2206                             'url': absolute_url(src),
2207                         })
2208             if media_info['formats'] or media_info['subtitles']:
2209                 entries.append(media_info)
2210         return entries
2211
2212     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2213         formats = []
2214         hdcore_sign = 'hdcore=3.7.0'
2215         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2216         hds_host = hosts.get('hds')
2217         if hds_host:
2218             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2219         if 'hdcore=' not in f4m_url:
2220             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2221         f4m_formats = self._extract_f4m_formats(
2222             f4m_url, video_id, f4m_id='hds', fatal=False)
2223         for entry in f4m_formats:
2224             entry.update({'extra_param_to_segment_url': hdcore_sign})
2225         formats.extend(f4m_formats)
2226         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2227         hls_host = hosts.get('hls')
2228         if hls_host:
2229             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2230         formats.extend(self._extract_m3u8_formats(
2231             m3u8_url, video_id, 'mp4', 'm3u8_native',
2232             m3u8_id='hls', fatal=False))
2233         return formats
2234
2235     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2236         query = compat_urlparse.urlparse(url).query
2237         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2238         url_base = self._search_regex(
2239             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2240         http_base_url = '%s:%s' % ('http', url_base)
2241         formats = []
2242
2243         def manifest_url(manifest):
2244             m_url = '%s/%s' % (http_base_url, manifest)
2245             if query:
2246                 m_url += '?%s' % query
2247             return m_url
2248
2249         if 'm3u8' not in skip_protocols:
2250             formats.extend(self._extract_m3u8_formats(
2251                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2252                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2253         if 'f4m' not in skip_protocols:
2254             formats.extend(self._extract_f4m_formats(
2255                 manifest_url('manifest.f4m'),
2256                 video_id, f4m_id='hds', fatal=False))
2257         if 'dash' not in skip_protocols:
2258             formats.extend(self._extract_mpd_formats(
2259                 manifest_url('manifest.mpd'),
2260                 video_id, mpd_id='dash', fatal=False))
2261         if re.search(r'(?:/smil:|\.smil)', url_base):
2262             if 'smil' not in skip_protocols:
2263                 rtmp_formats = self._extract_smil_formats(
2264                     manifest_url('jwplayer.smil'),
2265                     video_id, fatal=False)
2266                 for rtmp_format in rtmp_formats:
2267                     rtsp_format = rtmp_format.copy()
2268                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2269                     del rtsp_format['play_path']
2270                     del rtsp_format['ext']
2271                     rtsp_format.update({
2272                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2273                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2274                         'protocol': 'rtsp',
2275                     })
2276                     formats.extend([rtmp_format, rtsp_format])
2277         else:
2278             for protocol in ('rtmp', 'rtsp'):
2279                 if protocol not in skip_protocols:
2280                     formats.append({
2281                         'url': '%s:%s' % (protocol, url_base),
2282                         'format_id': protocol,
2283                         'protocol': protocol,
2284                     })
2285         return formats
2286
2287     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2288         mobj = re.search(
2289             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2290             webpage)
2291         if mobj:
2292             try:
2293                 jwplayer_data = self._parse_json(mobj.group('options'),
2294                                                  video_id=video_id,
2295                                                  transform_source=transform_source)
2296             except ExtractorError:
2297                 pass
2298             else:
2299                 if isinstance(jwplayer_data, dict):
2300                     return jwplayer_data
2301
2302     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2303         jwplayer_data = self._find_jwplayer_data(
2304             webpage, video_id, transform_source=js_to_json)
2305         return self._parse_jwplayer_data(
2306             jwplayer_data, video_id, *args, **kwargs)
2307
2308     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2309                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2310         # JWPlayer backward compatibility: flattened playlists
2311         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2312         if 'playlist' not in jwplayer_data:
2313             jwplayer_data = {'playlist': [jwplayer_data]}
2314
2315         entries = []
2316
2317         # JWPlayer backward compatibility: single playlist item
2318         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2319         if not isinstance(jwplayer_data['playlist'], list):
2320             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2321
2322         for video_data in jwplayer_data['playlist']:
2323             # JWPlayer backward compatibility: flattened sources
2324             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2325             if 'sources' not in video_data:
2326                 video_data['sources'] = [video_data]
2327
2328             this_video_id = video_id or video_data['mediaid']
2329
2330             formats = self._parse_jwplayer_formats(
2331                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2332                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2333
2334             subtitles = {}
2335             tracks = video_data.get('tracks')
2336             if tracks and isinstance(tracks, list):
2337                 for track in tracks:
2338                     if not isinstance(track, dict):
2339                         continue
2340                     if track.get('kind') != 'captions':
2341                         continue
2342                     track_url = urljoin(base_url, track.get('file'))
2343                     if not track_url:
2344                         continue
2345                     subtitles.setdefault(track.get('label') or 'en', []).append({
2346                         'url': self._proto_relative_url(track_url)
2347                     })
2348
2349             entry = {
2350                 'id': this_video_id,
2351                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2352                 'description': video_data.get('description'),
2353                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2354                 'timestamp': int_or_none(video_data.get('pubdate')),
2355                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2356                 'subtitles': subtitles,
2357             }
2358             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2359             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2360                 entry.update({
2361                     '_type': 'url_transparent',
2362                     'url': formats[0]['url'],
2363                 })
2364             else:
2365                 self._sort_formats(formats)
2366                 entry['formats'] = formats
2367             entries.append(entry)
2368         if len(entries) == 1:
2369             return entries[0]
2370         else:
2371             return self.playlist_result(entries)
2372
2373     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2374                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2375         urls = []
2376         formats = []
2377         for source in jwplayer_sources_data:
2378             if not isinstance(source, dict):
2379                 continue
2380             source_url = self._proto_relative_url(source.get('file'))
2381             if not source_url:
2382                 continue
2383             if base_url:
2384                 source_url = compat_urlparse.urljoin(base_url, source_url)
2385             if source_url in urls:
2386                 continue
2387             urls.append(source_url)
2388             source_type = source.get('type') or ''
2389             ext = mimetype2ext(source_type) or determine_ext(source_url)
2390             if source_type == 'hls' or ext == 'm3u8':
2391                 formats.extend(self._extract_m3u8_formats(
2392                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2393                     m3u8_id=m3u8_id, fatal=False))
2394             elif ext == 'mpd':
2395                 formats.extend(self._extract_mpd_formats(
2396                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2397             elif ext == 'smil':
2398                 formats.extend(self._extract_smil_formats(
2399                     source_url, video_id, fatal=False))
2400             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2401             elif source_type.startswith('audio') or ext in (
2402                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2403                 formats.append({
2404                     'url': source_url,
2405                     'vcodec': 'none',
2406                     'ext': ext,
2407                 })
2408             else:
2409                 height = int_or_none(source.get('height'))
2410                 if height is None:
2411                     # Often no height is provided but there is a label in
2412                     # format like "1080p", "720p SD", or 1080.
2413                     height = int_or_none(self._search_regex(
2414                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2415                         'height', default=None))
2416                 a_format = {
2417                     'url': source_url,
2418                     'width': int_or_none(source.get('width')),
2419                     'height': height,
2420                     'tbr': int_or_none(source.get('bitrate')),
2421                     'ext': ext,
2422                 }
2423                 if source_url.startswith('rtmp'):
2424                     a_format['ext'] = 'flv'
2425                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2426                     # of jwplayer.flash.swf
2427                     rtmp_url_parts = re.split(
2428                         r'((?:mp4|mp3|flv):)', source_url, 1)
2429                     if len(rtmp_url_parts) == 3:
2430                         rtmp_url, prefix, play_path = rtmp_url_parts
2431                         a_format.update({
2432                             'url': rtmp_url,
2433                             'play_path': prefix + play_path,
2434                         })
2435                     if rtmp_params:
2436                         a_format.update(rtmp_params)
2437                 formats.append(a_format)
2438         return formats
2439
2440     def _live_title(self, name):
2441         """ Generate the title for a live video """
2442         now = datetime.datetime.now()
2443         now_str = now.strftime('%Y-%m-%d %H:%M')
2444         return name + ' ' + now_str
2445
2446     def _int(self, v, name, fatal=False, **kwargs):
2447         res = int_or_none(v, **kwargs)
2448         if 'get_attr' in kwargs:
2449             print(getattr(v, kwargs['get_attr']))
2450         if res is None:
2451             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2452             if fatal:
2453                 raise ExtractorError(msg)
2454             else:
2455                 self._downloader.report_warning(msg)
2456         return res
2457
2458     def _float(self, v, name, fatal=False, **kwargs):
2459         res = float_or_none(v, **kwargs)
2460         if res is None:
2461             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2462             if fatal:
2463                 raise ExtractorError(msg)
2464             else:
2465                 self._downloader.report_warning(msg)
2466         return res
2467
2468     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2469                     path='/', secure=False, discard=False, rest={}, **kwargs):
2470         cookie = compat_cookiejar.Cookie(
2471             0, name, value, port, port is not None, domain, True,
2472             domain.startswith('.'), path, True, secure, expire_time,
2473             discard, None, None, rest)
2474         self._downloader.cookiejar.set_cookie(cookie)
2475
2476     def _get_cookies(self, url):
2477         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2478         req = sanitized_Request(url)
2479         self._downloader.cookiejar.add_cookie_header(req)
2480         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2481
2482     def get_testcases(self, include_onlymatching=False):
2483         t = getattr(self, '_TEST', None)
2484         if t:
2485             assert not hasattr(self, '_TESTS'), \
2486                 '%s has _TEST and _TESTS' % type(self).__name__
2487             tests = [t]
2488         else:
2489             tests = getattr(self, '_TESTS', [])
2490         for t in tests:
2491             if not include_onlymatching and t.get('only_matching', False):
2492                 continue
2493             t['name'] = type(self).__name__[:-len('IE')]
2494             yield t
2495
2496     def is_suitable(self, age_limit):
2497         """ Test whether the extractor is generally suitable for the given
2498         age limit (i.e. pornographic sites are not, all others usually are) """
2499
2500         any_restricted = False
2501         for tc in self.get_testcases(include_onlymatching=False):
2502             if tc.get('playlist', []):
2503                 tc = tc['playlist'][0]
2504             is_restricted = age_restricted(
2505                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2506             if not is_restricted:
2507                 return True
2508             any_restricted = any_restricted or is_restricted
2509         return not any_restricted
2510
2511     def extract_subtitles(self, *args, **kwargs):
2512         if (self._downloader.params.get('writesubtitles', False) or
2513                 self._downloader.params.get('listsubtitles')):
2514             return self._get_subtitles(*args, **kwargs)
2515         return {}
2516
2517     def _get_subtitles(self, *args, **kwargs):
2518         raise NotImplementedError('This method must be implemented by subclasses')
2519
2520     @staticmethod
2521     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2522         """ Merge subtitle items for one language. Items with duplicated URLs
2523         will be dropped. """
2524         list1_urls = set([item['url'] for item in subtitle_list1])
2525         ret = list(subtitle_list1)
2526         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2527         return ret
2528
2529     @classmethod
2530     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2531         """ Merge two subtitle dictionaries, language by language. """
2532         ret = dict(subtitle_dict1)
2533         for lang in subtitle_dict2:
2534             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2535         return ret
2536
2537     def extract_automatic_captions(self, *args, **kwargs):
2538         if (self._downloader.params.get('writeautomaticsub', False) or
2539                 self._downloader.params.get('listsubtitles')):
2540             return self._get_automatic_captions(*args, **kwargs)
2541         return {}
2542
2543     def _get_automatic_captions(self, *args, **kwargs):
2544         raise NotImplementedError('This method must be implemented by subclasses')
2545
2546     def mark_watched(self, *args, **kwargs):
2547         if (self._downloader.params.get('mark_watched', False) and
2548                 (self._get_login_info()[0] is not None or
2549                     self._downloader.params.get('cookiefile') is not None)):
2550             self._mark_watched(*args, **kwargs)
2551
2552     def _mark_watched(self, *args, **kwargs):
2553         raise NotImplementedError('This method must be implemented by subclasses')
2554
2555     def geo_verification_headers(self):
2556         headers = {}
2557         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2558         if geo_verification_proxy:
2559             headers['Ytdl-request-proxy'] = geo_verification_proxy
2560         return headers
2561
2562     def _generic_id(self, url):
2563         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2564
2565     def _generic_title(self, url):
2566         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2567
2568
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors handling paged search queries.

    The accepted "URLs" have the form _SEARCH_KEY(|all|[1-9][0-9]*):{query}:
    an empty prefix asks for a single result, a number asks for that many
    results and "all" asks for _MAX_RESULTS results.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex that search queries for this extractor match"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        match = re.match(cls._make_valid_url(), url)
        return match is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results

        Raises ExtractorError when query does not match the expected format
        or asks for a non-positive number of results. A number above
        _MAX_RESULTS is capped to it after reporting a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')

        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
            if wanted > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY"""
        return self._SEARCH_KEY