[extractor/common] Move censorship checks to a separate method and add check for...
[youtube-dl] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import random
10 import re
11 import socket
12 import sys
13 import time
14 import math
15
16 from ..compat import (
17     compat_cookiejar,
18     compat_cookies,
19     compat_etree_fromstring,
20     compat_getpass,
21     compat_http_client,
22     compat_os_name,
23     compat_str,
24     compat_urllib_error,
25     compat_urllib_parse_unquote,
26     compat_urllib_parse_urlencode,
27     compat_urllib_request,
28     compat_urlparse,
29 )
30 from ..downloader.f4m import remove_encrypted_media
31 from ..utils import (
32     NO_DEFAULT,
33     age_restricted,
34     base_url,
35     bug_reports_message,
36     clean_html,
37     compiled_regex_type,
38     determine_ext,
39     determine_protocol,
40     error_to_compat_str,
41     ExtractorError,
42     extract_attributes,
43     fix_xml_ampersands,
44     float_or_none,
45     GeoRestrictedError,
46     GeoUtils,
47     int_or_none,
48     js_to_json,
49     mimetype2ext,
50     orderedSet,
51     parse_codecs,
52     parse_duration,
53     parse_iso8601,
54     parse_m3u8_attributes,
55     RegexNotFoundError,
56     sanitized_Request,
57     sanitize_filename,
58     unescapeHTML,
59     unified_strdate,
60     unified_timestamp,
61     update_Request,
62     update_url_query,
63     urljoin,
64     url_basename,
65     xpath_element,
66     xpath_text,
67     xpath_with_ns,
68 )
69
70
71 class InfoExtractor(object):
72     """Information Extractor class.
73
74     Information extractors are the classes that, given a URL, extract
75     information about the video (or videos) the URL refers to. This
76     information includes the real video URL, the video title, author and
77     others. The information is stored in a dictionary which is then
78     passed to the YoutubeDL. The YoutubeDL processes this
79     information possibly downloading the video to the file system, among
80     other possible outcomes.
81
82     The type field determines the type of the result.
83     By far the most common value (and the default if _type is missing) is
84     "video", which indicates a single video.
85
86     For a video, the dictionaries must include the following fields:
87
88     id:             Video identifier.
89     title:          Video title, unescaped.
90
91     Additionally, it must contain either a formats entry or a url one:
92
93     formats:        A list of dictionaries for each format available, ordered
94                     from worst to best quality.
95
96                     Potential fields:
97                     * url        Mandatory. The URL of the video file
98                     * manifest_url
99                                  The URL of the manifest file in case of
100                                  fragmented media (DASH, hls, hds)
101                     * ext        Will be calculated from URL if missing
102                     * format     A human-readable description of the format
103                                  ("mp4 container with h264/opus").
104                                  Calculated from the format_id, width, height.
105                                  and format_note fields if missing.
106                     * format_id  A short description of the format
107                                  ("mp4_h264_opus" or "19").
108                                 Technically optional, but strongly recommended.
109                     * format_note Additional info about the format
110                                  ("3D" or "DASH video")
111                     * width      Width of the video, if known
112                     * height     Height of the video, if known
113                     * resolution Textual description of width and height
114                     * tbr        Average bitrate of audio and video in KBit/s
115                     * abr        Average audio bitrate in KBit/s
116                     * acodec     Name of the audio codec in use
117                     * asr        Audio sampling rate in Hertz
118                     * vbr        Average video bitrate in KBit/s
119                     * fps        Frame rate
120                     * vcodec     Name of the video codec in use
121                     * container  Name of the container format
122                     * filesize   The number of bytes, if known in advance
123                     * filesize_approx  An estimate for the number of bytes
124                     * player_url SWF Player URL (used for rtmpdump).
125                     * protocol   The protocol that will be used for the actual
126                                  download, lower-case.
127                                  "http", "https", "rtsp", "rtmp", "rtmpe",
128                                  "m3u8", "m3u8_native" or "http_dash_segments".
129                     * fragment_base_url
130                                  Base URL for fragments. Each fragment's path
131                                  value (if present) will be relative to
132                                  this URL.
133                     * fragments  A list of fragments of a fragmented media.
134                                  Each fragment entry must contain either an url
135                                  or a path. If an url is present it should be
136                                  considered by a client. Otherwise both path and
137                                  fragment_base_url must be present. Here is
138                                  the list of all potential fields:
139                                  * "url" - fragment's URL
140                                  * "path" - fragment's path relative to
141                                             fragment_base_url
142                                  * "duration" (optional, int or float)
143                                  * "filesize" (optional, int)
144                     * preference Order number of this format. If this field is
145                                  present and not None, the formats get sorted
146                                  by this field, regardless of all other values.
147                                  -1 for default (order by other properties),
148                                  -2 or smaller for less than default.
149                                  < -1000 to hide the format (if there is
150                                     another one which is strictly better)
151                     * language   Language code, e.g. "de" or "en-US".
152                     * language_preference  Is this in the language mentioned in
153                                  the URL?
154                                  10 if it's what the URL is about,
155                                  -1 for default (don't know),
156                                  -10 otherwise, other values reserved for now.
157                     * quality    Order number of the video quality of this
158                                  format, irrespective of the file format.
159                                  -1 for default (order by other properties),
160                                  -2 or smaller for less than default.
161                     * source_preference  Order number for this video source
162                                   (quality takes higher priority)
163                                  -1 for default (order by other properties),
164                                  -2 or smaller for less than default.
165                     * http_headers  A dictionary of additional HTTP headers
166                                  to add to the request.
167                     * stretched_ratio  If given and not 1, indicates that the
168                                  video's pixels are not square.
169                                  width : height ratio as float.
170                     * no_resume  The server does not support resuming the
171                                  (HTTP or RTMP) download. Boolean.
172
173     url:            Final video URL.
174     ext:            Video filename extension.
175     format:         The video format, defaults to ext (used for --get-format)
176     player_url:     SWF Player URL (used for rtmpdump).
177
178     The following fields are optional:
179
180     alt_title:      A secondary title of the video.
181     display_id      An alternative identifier for the video, not necessarily
182                     unique, but available before title. Typically, id is
183                     something like "4234987", title "Dancing naked mole rats",
184                     and display_id "dancing-naked-mole-rats"
185     thumbnails:     A list of dictionaries, with the following entries:
186                         * "id" (optional, string) - Thumbnail format ID
187                         * "url"
188                         * "preference" (optional, int) - quality of the image
189                         * "width" (optional, int)
190                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
192                                         deprecated)
193                         * "filesize" (optional, int)
194     thumbnail:      Full URL to a video thumbnail image.
195     description:    Full video description.
196     uploader:       Full name of the video uploader.
197     license:        License name the video is licensed under.
198     creator:        The creator of the video.
199     release_date:   The date (YYYYMMDD) when the video was released.
200     timestamp:      UNIX timestamp of the moment the video became available.
201     upload_date:    Video upload date (YYYYMMDD).
202                     If not explicitly set, calculated from timestamp.
203     uploader_id:    Nickname or id of the video uploader.
204     uploader_url:   Full URL to a personal webpage of the video uploader.
205     location:       Physical location where the video was filmed.
206     subtitles:      The available subtitles as a dictionary in the format
207                     {tag: subformats}. "tag" is usually a language code, and
208                     "subformats" is a list sorted from lower to higher
209                     preference, each element is a dictionary with the "ext"
210                     entry and one of:
211                         * "data": The subtitles file contents
212                         * "url": A URL pointing to the subtitles file
213                     "ext" will be calculated from URL if missing
214     automatic_captions: Like 'subtitles', used by the YoutubeIE for
215                     automatically generated captions
216     duration:       Length of the video in seconds, as an integer or float.
217     view_count:     How many users have watched the video on the platform.
218     like_count:     Number of positive ratings of the video
219     dislike_count:  Number of negative ratings of the video
220     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
222     comment_count:  Number of comments on the video
223     comments:       A list of comments, each with one or more of the following
224                     properties (all but one of text or html optional):
225                         * "author" - human-readable name of the comment author
226                         * "author_id" - user ID of the comment author
227                         * "id" - Comment ID
228                         * "html" - Comment as HTML
229                         * "text" - Plain text of the comment
230                         * "timestamp" - UNIX timestamp of comment
231                         * "parent" - ID of the comment this one is replying to.
232                                      Set to "root" to indicate that this is a
233                                      comment to the original video.
234     age_limit:      Age restriction for the video, as an integer (years)
235     webpage_url:    The URL to the video webpage, if given to youtube-dl it
236                     should allow to get the same result again. (It will be set
237                     by YoutubeDL if it's missing)
238     categories:     A list of categories that the video falls in, for example
239                     ["Sports", "Berlin"]
240     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
241     is_live:        True, False, or None (=unknown). Whether this video is a
242                     live stream that goes on instead of a fixed-length video.
243     start_time:     Time in seconds where the reproduction should start, as
244                     specified in the URL.
245     end_time:       Time in seconds where the reproduction should end, as
246                     specified in the URL.
247
248     The following fields should only be used when the video belongs to some logical
249     chapter or section:
250
251     chapter:        Name or title of the chapter the video belongs to.
252     chapter_number: Number of the chapter the video belongs to, as an integer.
253     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
254
255     The following fields should only be used when the video is an episode of some
256     series, programme or podcast:
257
258     series:         Title of the series or programme the video episode belongs to.
259     season:         Title of the season the video episode belongs to.
260     season_number:  Number of the season the video episode belongs to, as an integer.
261     season_id:      Id of the season the video episode belongs to, as a unicode string.
262     episode:        Title of the video episode. Unlike mandatory video title field,
263                     this field should denote the exact title of the video episode
264                     without any kind of decoration.
265     episode_number: Number of the video episode within a season, as an integer.
266     episode_id:     Id of the video episode, as a unicode string.
267
268     The following fields should only be used when the media is a track or a part of
269     a music album:
270
271     track:          Title of the track.
272     track_number:   Number of the track within an album or a disc, as an integer.
273     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
274                     as a unicode string.
275     artist:         Artist(s) of the track.
276     genre:          Genre(s) of the track.
277     album:          Title of the album the track belongs to.
278     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
279     album_artist:   List of all artists appeared on the album (e.g.
280                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281                     and compilations).
282     disc_number:    Number of the disc or other physical medium the track belongs to,
283                     as an integer.
284     release_year:   Year (YYYY) when the album was released.
285
286     Unless mentioned otherwise, the fields should be Unicode strings.
287
288     Unless mentioned otherwise, None is equivalent to absence of information.
289
290
291     _type "playlist" indicates multiple videos.
292     There must be a key "entries", which is a list, an iterable, or a PagedList
293     object, each element of which is a valid dictionary by this specification.
294
295     Additionally, playlists can have "title", "description" and "id" attributes
296     with the same semantics as videos (see above).
297
298
299     _type "multi_video" indicates that there are multiple videos that
300     form a single show, for examples multiple acts of an opera or TV episode.
301     It must have an entries key like a playlist and contain all the keys
302     required for a video at the same time.
303
304
305     _type "url" indicates that the video must be extracted from another
306     location, possibly by a different extractor. Its only required key is:
307     "url" - the next URL to extract.
308     The key "ie_key" can be set to the class name (minus the trailing "IE",
309     e.g. "Youtube") if the extractor class is known in advance.
310     Additionally, the dictionary may have any properties of the resolved entity
311     known in advance, for example "title" if the title of the referred video is
312     known ahead of time.
313
314
315     _type "url_transparent" entities have the same specification as "url", but
316     indicate that the given additional information is more precise than the one
317     associated with the resolved URL.
318     This is useful when a site employs a video service that hosts the video and
319     its technical metadata, but that video service does not embed a useful
320     title, description etc.
321
322
323     Subclasses of this one should re-define the _real_initialize() and
324     _real_extract() methods and define a _VALID_URL regexp.
325     Probably, they should also be added to the list of extractors.
326
327     _GEO_BYPASS attribute may be set to False in order to disable
328     geo restriction bypass mechanisms for a particular extractor.
329     Though it won't disable explicit geo restriction bypass based on
330     country code provided with geo_bypass_country. (experimental)
331
332     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
333     countries for this extractor. One of these countries will be used by
334     geo restriction bypass mechanism right away in order to bypass
335     geo restriction, of course, if the mechanism is not disabled. (experimental)
336
337     NB: both these geo attributes are experimental and may change in future
338     or be completely removed.
339
340     Finally, the _WORKING attribute should be set to False for broken IEs
341     in order to warn the users and skip the tests.
342     """
343
344     _ready = False
345     _downloader = None
346     _x_forwarded_for_ip = None
347     _GEO_BYPASS = True
348     _GEO_COUNTRIES = None
349     _WORKING = True
350
351     def __init__(self, downloader=None):
352         """Constructor. Receives an optional downloader."""
353         self._ready = False
354         self._x_forwarded_for_ip = None
355         self.set_downloader(downloader)
356
357     @classmethod
358     def suitable(cls, url):
359         """Receives a URL and returns True if suitable for this IE."""
360
361         # This does not use has/getattr intentionally - we want to know whether
362         # we have cached the regexp for *this* class, whereas getattr would also
363         # match the superclass
364         if '_VALID_URL_RE' not in cls.__dict__:
365             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
366         return cls._VALID_URL_RE.match(url) is not None
367
368     @classmethod
369     def _match_id(cls, url):
370         if '_VALID_URL_RE' not in cls.__dict__:
371             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
372         m = cls._VALID_URL_RE.match(url)
373         assert m
374         return m.group('id')
375
376     @classmethod
377     def working(cls):
378         """Getter method for _WORKING."""
379         return cls._WORKING
380
381     def initialize(self):
382         """Initializes an instance (authentication, etc)."""
383         self._initialize_geo_bypass(self._GEO_COUNTRIES)
384         if not self._ready:
385             self._real_initialize()
386             self._ready = True
387
388     def _initialize_geo_bypass(self, countries):
389         """
390         Initialize geo restriction bypass mechanism.
391
392         This method is used to initialize geo bypass mechanism based on faking
393         X-Forwarded-For HTTP header. A random country from provided country list
394         is selected and a random IP belonging to this country is generated. This
395         IP will be passed as X-Forwarded-For HTTP header in all subsequent
396         HTTP requests.
397
398         This method will be used for initial geo bypass mechanism initialization
399         during the instance initialization with _GEO_COUNTRIES.
400
401         You may also manually call it from extractor's code if geo countries
402         information is not available beforehand (e.g. obtained during
403         extraction) or due to some another reason.
404         """
405         if not self._x_forwarded_for_ip:
406             country_code = self._downloader.params.get('geo_bypass_country', None)
407             # If there is no explicit country for geo bypass specified and
408             # the extractor is known to be geo restricted let's fake IP
409             # as X-Forwarded-For right away.
410             if (not country_code and
411                     self._GEO_BYPASS and
412                     self._downloader.params.get('geo_bypass', True) and
413                     countries):
414                 country_code = random.choice(countries)
415             if country_code:
416                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
417                 if self._downloader.params.get('verbose', False):
418                     self._downloader.to_stdout(
419                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
420                         % (self._x_forwarded_for_ip, country_code.upper()))
421
422     def extract(self, url):
423         """Extracts URL information and returns it in list of dicts."""
424         try:
425             for _ in range(2):
426                 try:
427                     self.initialize()
428                     ie_result = self._real_extract(url)
429                     if self._x_forwarded_for_ip:
430                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
431                     return ie_result
432                 except GeoRestrictedError as e:
433                     if self.__maybe_fake_ip_and_retry(e.countries):
434                         continue
435                     raise
436         except ExtractorError:
437             raise
438         except compat_http_client.IncompleteRead as e:
439             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
440         except (KeyError, StopIteration) as e:
441             raise ExtractorError('An extractor error has occurred.', cause=e)
442
443     def __maybe_fake_ip_and_retry(self, countries):
444         if (not self._downloader.params.get('geo_bypass_country', None) and
445                 self._GEO_BYPASS and
446                 self._downloader.params.get('geo_bypass', True) and
447                 not self._x_forwarded_for_ip and
448                 countries):
449             country_code = random.choice(countries)
450             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
451             if self._x_forwarded_for_ip:
452                 self.report_warning(
453                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
454                     % (self._x_forwarded_for_ip, country_code.upper()))
455                 return True
456         return False
457
458     def set_downloader(self, downloader):
459         """Sets the downloader for this IE."""
460         self._downloader = downloader
461
462     def _real_initialize(self):
463         """Real initialization process. Redefine in subclasses."""
464         pass
465
466     def _real_extract(self, url):
467         """Real extraction process. Redefine in subclasses."""
468         pass
469
470     @classmethod
471     def ie_key(cls):
472         """A string for getting the InfoExtractor with get_info_extractor"""
473         return compat_str(cls.__name__[:-2])
474
475     @property
476     def IE_NAME(self):
477         return compat_str(type(self).__name__[:-2])
478
479     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
480         """ Returns the response handle """
481         if note is None:
482             self.report_download_webpage(video_id)
483         elif note is not False:
484             if video_id is None:
485                 self.to_screen('%s' % (note,))
486             else:
487                 self.to_screen('%s: %s' % (video_id, note))
488         if isinstance(url_or_request, compat_urllib_request.Request):
489             url_or_request = update_Request(
490                 url_or_request, data=data, headers=headers, query=query)
491         else:
492             if query:
493                 url_or_request = update_url_query(url_or_request, query)
494             if data is not None or headers:
495                 url_or_request = sanitized_Request(url_or_request, data, headers)
496         try:
497             return self._downloader.urlopen(url_or_request)
498         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
499             if errnote is False:
500                 return False
501             if errnote is None:
502                 errnote = 'Unable to download webpage'
503
504             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
505             if fatal:
506                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
507             else:
508                 self._downloader.report_warning(errmsg)
509                 return False
510
511     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
512         """ Returns a tuple (page content as string, URL handle) """
513         # Strip hashes from the URL (#1038)
514         if isinstance(url_or_request, (compat_str, str)):
515             url_or_request = url_or_request.partition('#')[0]
516
517         # Some sites check X-Forwarded-For HTTP header in order to figure out
518         # the origin of the client behind proxy. This allows bypassing geo
519         # restriction by faking this header's value to IP that belongs to some
520         # geo unrestricted country. We will do so once we encounter any
521         # geo restriction error.
522         if self._x_forwarded_for_ip:
523             if 'X-Forwarded-For' not in headers:
524                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
525
526         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
527         if urlh is False:
528             assert not fatal
529             return False
530         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
531         return (content, urlh)
532
533     @staticmethod
534     def _guess_encoding_from_content(content_type, webpage_bytes):
535         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
536         if m:
537             encoding = m.group(1)
538         else:
539             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
540                           webpage_bytes[:1024])
541             if m:
542                 encoding = m.group(1).decode('ascii')
543             elif webpage_bytes.startswith(b'\xff\xfe'):
544                 encoding = 'utf-16'
545             else:
546                 encoding = 'utf-8'
547
548         return encoding
549
550     def __check_blocked(self, content):
551         first_block = content[:512]
552         if ('<title>Access to this site is blocked</title>' in content and
553                 'Websense' in first_block):
554             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
555             blocked_iframe = self._html_search_regex(
556                 r'<iframe src="([^"]+)"', content,
557                 'Websense information URL', default=None)
558             if blocked_iframe:
559                 msg += ' Visit %s for more details' % blocked_iframe
560             raise ExtractorError(msg, expected=True)
561         if '<title>The URL you requested has been blocked</title>' in first_block:
562             msg = (
563                 'Access to this webpage has been blocked by Indian censorship. '
564                 'Use a VPN or proxy server (with --proxy) to route around it.')
565             block_msg = self._html_search_regex(
566                 r'</h1><p>(.*?)</p>',
567                 content, 'block message', default=None)
568             if block_msg:
569                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
570             raise ExtractorError(msg, expected=True)
571         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
572                 'blocklist.rkn.gov.ru' in content):
573             raise ExtractorError(
574                 'Access to this webpage has been blocked by decision of the Russian government. '
575                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
576                 expected=True)
577
578     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
579         content_type = urlh.headers.get('Content-Type', '')
580         webpage_bytes = urlh.read()
581         if prefix is not None:
582             webpage_bytes = prefix + webpage_bytes
583         if not encoding:
584             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
585         if self._downloader.params.get('dump_intermediate_pages', False):
586             try:
587                 url = url_or_request.get_full_url()
588             except AttributeError:
589                 url = url_or_request
590             self.to_screen('Dumping request to ' + url)
591             dump = base64.b64encode(webpage_bytes).decode('ascii')
592             self._downloader.to_screen(dump)
593         if self._downloader.params.get('write_pages', False):
594             try:
595                 url = url_or_request.get_full_url()
596             except AttributeError:
597                 url = url_or_request
598             basen = '%s_%s' % (video_id, url)
599             if len(basen) > 240:
600                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
601                 basen = basen[:240 - len(h)] + h
602             raw_filename = basen + '.dump'
603             filename = sanitize_filename(raw_filename, restricted=True)
604             self.to_screen('Saving request to ' + filename)
605             # Working around MAX_PATH limitation on Windows (see
606             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
607             if compat_os_name == 'nt':
608                 absfilepath = os.path.abspath(filename)
609                 if len(absfilepath) > 259:
610                     filename = '\\\\?\\' + absfilepath
611             with open(filename, 'wb') as outf:
612                 outf.write(webpage_bytes)
613
614         try:
615             content = webpage_bytes.decode(encoding, 'replace')
616         except LookupError:
617             content = webpage_bytes.decode('utf-8', 'replace')
618
619         self.__check_blocked(content)
620
621         return content
622
623     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
624         """ Returns the data of the page as a string """
625         success = False
626         try_count = 0
627         while success is False:
628             try:
629                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
630                 success = True
631             except compat_http_client.IncompleteRead as e:
632                 try_count += 1
633                 if try_count >= tries:
634                     raise e
635                 self._sleep(timeout, video_id)
636         if res is False:
637             return res
638         else:
639             content, _ = res
640             return content
641
642     def _download_xml(self, url_or_request, video_id,
643                       note='Downloading XML', errnote='Unable to download XML',
644                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
645         """Return the xml as an xml.etree.ElementTree.Element"""
646         xml_string = self._download_webpage(
647             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
648         if xml_string is False:
649             return xml_string
650         if transform_source:
651             xml_string = transform_source(xml_string)
652         return compat_etree_fromstring(xml_string.encode('utf-8'))
653
654     def _download_json(self, url_or_request, video_id,
655                        note='Downloading JSON metadata',
656                        errnote='Unable to download JSON metadata',
657                        transform_source=None,
658                        fatal=True, encoding=None, data=None, headers={}, query={}):
659         json_string = self._download_webpage(
660             url_or_request, video_id, note, errnote, fatal=fatal,
661             encoding=encoding, data=data, headers=headers, query=query)
662         if (not fatal) and json_string is False:
663             return None
664         return self._parse_json(
665             json_string, video_id, transform_source=transform_source, fatal=fatal)
666
667     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
668         if transform_source:
669             json_string = transform_source(json_string)
670         try:
671             return json.loads(json_string)
672         except ValueError as ve:
673             errmsg = '%s: Failed to parse JSON ' % video_id
674             if fatal:
675                 raise ExtractorError(errmsg, cause=ve)
676             else:
677                 self.report_warning(errmsg + str(ve))
678
679     def report_warning(self, msg, video_id=None):
680         idstr = '' if video_id is None else '%s: ' % video_id
681         self._downloader.report_warning(
682             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
683
684     def to_screen(self, msg):
685         """Print msg to screen, prefixing it with '[ie_name]'"""
686         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
687
688     def report_extraction(self, id_or_name):
689         """Report information extraction."""
690         self.to_screen('%s: Extracting information' % id_or_name)
691
692     def report_download_webpage(self, video_id):
693         """Report webpage download."""
694         self.to_screen('%s: Downloading webpage' % video_id)
695
696     def report_age_confirmation(self):
697         """Report attempt to confirm age."""
698         self.to_screen('Confirming age')
699
700     def report_login(self):
701         """Report attempt to log in."""
702         self.to_screen('Logging in')
703
704     @staticmethod
705     def raise_login_required(msg='This video is only available for registered users'):
706         raise ExtractorError(
707             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
708             expected=True)
709
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        # Abort extraction with a GeoRestrictedError; `countries` optionally
        # lists the ISO 3166-1 codes where the video is available so the
        # downloader can suggest --geo-bypass-country.
        raise GeoRestrictedError(msg, countries=countries)
713
714     # Methods for following #608
715     @staticmethod
716     def url_result(url, ie=None, video_id=None, video_title=None):
717         """Returns a URL that points to a page that should be processed"""
718         # TODO: ie should be the class used for getting the info
719         video_info = {'_type': 'url',
720                       'url': url,
721                       'ie_key': ie}
722         if video_id is not None:
723             video_info['id'] = video_id
724         if video_title is not None:
725             video_info['title'] = video_title
726         return video_info
727
728     def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
729         urlrs = orderedSet(
730             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
731             for m in matches)
732         return self.playlist_result(
733             urlrs, playlist_id=video_id, playlist_title=video_title)
734
735     @staticmethod
736     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
737         """Returns a playlist"""
738         video_info = {'_type': 'playlist',
739                       'entries': entries}
740         if playlist_id:
741             video_info['id'] = playlist_id
742         if playlist_title:
743             video_info['title'] = playlist_title
744         if playlist_description:
745             video_info['description'] = playlist_description
746         return video_info
747
748     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
749         """
750         Perform a regex search on the given string, using a single or a list of
751         patterns returning the first matching group.
752         In case of failure return a default value or raise a WARNING or a
753         RegexNotFoundError, depending on fatal, specifying the field name.
754         """
755         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
756             mobj = re.search(pattern, string, flags)
757         else:
758             for p in pattern:
759                 mobj = re.search(p, string, flags)
760                 if mobj:
761                     break
762
763         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
764             _name = '\033[0;34m%s\033[0m' % name
765         else:
766             _name = name
767
768         if mobj:
769             if group is None:
770                 # return the first matching group
771                 return next(g for g in mobj.groups() if g is not None)
772             else:
773                 return mobj.group(group)
774         elif default is not NO_DEFAULT:
775             return default
776         elif fatal:
777             raise RegexNotFoundError('Unable to extract %s' % _name)
778         else:
779             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
780             return None
781
782     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
783         """
784         Like _search_regex, but strips HTML tags and unescapes entities.
785         """
786         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
787         if res:
788             return clean_html(res).strip()
789         else:
790             return res
791
792     def _get_netrc_login_info(self, netrc_machine=None):
793         username = None
794         password = None
795         netrc_machine = netrc_machine or self._NETRC_MACHINE
796
797         if self._downloader.params.get('usenetrc', False):
798             try:
799                 info = netrc.netrc().authenticators(netrc_machine)
800                 if info is not None:
801                     username = info[0]
802                     password = info[2]
803                 else:
804                     raise netrc.NetrcParseError(
805                         'No authenticators for %s' % netrc_machine)
806             except (IOError, netrc.NetrcParseError) as err:
807                 self._downloader.report_warning(
808                     'parsing .netrc: %s' % error_to_compat_str(err))
809
810         return username, password
811
812     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
813         """
814         Get the login info as (username, password)
815         First look for the manually specified credentials using username_option
816         and password_option as keys in params dictionary. If no such credentials
817         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
818         value.
819         If there's no info available, return (None, None)
820         """
821         if self._downloader is None:
822             return (None, None)
823
824         downloader_params = self._downloader.params
825
826         # Attempt to use provided username and password or .netrc data
827         if downloader_params.get(username_option) is not None:
828             username = downloader_params[username_option]
829             password = downloader_params[password_option]
830         else:
831             username, password = self._get_netrc_login_info(netrc_machine)
832
833         return username, password
834
835     def _get_tfa_info(self, note='two-factor verification code'):
836         """
837         Get the two-factor authentication info
838         TODO - asking the user will be required for sms/phone verify
839         currently just uses the command line option
840         If there's no info available, return None
841         """
842         if self._downloader is None:
843             return None
844         downloader_params = self._downloader.params
845
846         if downloader_params.get('twofactor') is not None:
847             return downloader_params['twofactor']
848
849         return compat_getpass('Type %s and press [Return]: ' % note)
850
851     # Helper functions for extracting OpenGraph info
852     @staticmethod
853     def _og_regexes(prop):
854         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
855         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
856                        % {'prop': re.escape(prop)})
857         template = r'<meta[^>]+?%s[^>]+?%s'
858         return [
859             template % (property_re, content_re),
860             template % (content_re, property_re),
861         ]
862
    @staticmethod
    def _meta_regex(prop):
        # Regex matching a <meta> tag whose itemprop/name/property/id/http-equiv
        # attribute equals `prop`; the content= value is captured in the named
        # group 'content'. The lookahead lets the identifying attribute appear
        # either before or after content=.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
868
869     def _og_search_property(self, prop, html, name=None, **kargs):
870         if not isinstance(prop, (list, tuple)):
871             prop = [prop]
872         if name is None:
873             name = 'OpenGraph %s' % prop[0]
874         og_regexes = []
875         for p in prop:
876             og_regexes.extend(self._og_regexes(p))
877         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
878         if escaped is None:
879             return None
880         return unescapeHTML(escaped)
881
882     def _og_search_thumbnail(self, html, **kargs):
883         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
884
885     def _og_search_description(self, html, **kargs):
886         return self._og_search_property('description', html, fatal=False, **kargs)
887
888     def _og_search_title(self, html, **kargs):
889         return self._og_search_property('title', html, **kargs)
890
891     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
892         regexes = self._og_regexes('video') + self._og_regexes('video:url')
893         if secure:
894             regexes = self._og_regexes('video:secure_url') + regexes
895         return self._html_search_regex(regexes, html, name, **kargs)
896
897     def _og_search_url(self, html, **kargs):
898         return self._og_search_property('url', html, **kargs)
899
900     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
901         if not isinstance(name, (list, tuple)):
902             name = [name]
903         if display_name is None:
904             display_name = name[0]
905         return self._html_search_regex(
906             [self._meta_regex(n) for n in name],
907             html, display_name, fatal=fatal, group='content', **kwargs)
908
    def _dc_search_uploader(self, html):
        # Extract the uploader from a Dublin Core dc.creator <meta> tag.
        return self._html_search_meta('dc.creator', html, 'uploader')
911
912     def _rta_search(self, html):
913         # See http://www.rtalabel.org/index.php?content=howtofaq#single
914         if re.search(r'(?ix)<meta\s+name="rating"\s+'
915                      r'     content="RTA-5042-1996-1400-1577-RTA"',
916                      html):
917             return 18
918         return 0
919
920     def _media_rating_search(self, html):
921         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
922         rating = self._html_search_meta('rating', html)
923
924         if not rating:
925             return None
926
927         RATING_TABLE = {
928             'safe for kids': 0,
929             'general': 8,
930             '14 years': 14,
931             'mature': 17,
932             'restricted': 19,
933         }
934         return RATING_TABLE.get(rating.lower())
935
936     def _family_friendly_search(self, html):
937         # See http://schema.org/VideoObject
938         family_friendly = self._html_search_meta('isFamilyFriendly', html)
939
940         if not family_friendly:
941             return None
942
943         RATING_TABLE = {
944             '1': 0,
945             'true': 0,
946             '0': 18,
947             'false': 18,
948         }
949         return RATING_TABLE.get(family_friendly.lower())
950
    def _twitter_search_player(self, html):
        # Extract the Twitter Card player URL from a twitter:player <meta> tag.
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
954
955     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
956         json_ld = self._search_regex(
957             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
958             html, 'JSON-LD', group='json_ld', **kwargs)
959         default = kwargs.get('default', NO_DEFAULT)
960         if not json_ld:
961             return default if default is not NO_DEFAULT else {}
962         # JSON-LD may be malformed and thus `fatal` should be respected.
963         # At the same time `default` may be passed that assumes `fatal=False`
964         # for _search_regex. Let's simulate the same behavior here as well.
965         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
966         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
967
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert JSON-LD metadata (a string or already-parsed structure)
        into an info dict; only the first http://schema.org entry is used and
        keys whose values are None are dropped from the result."""
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize to a list so single objects and arrays of objects are
            # handled uniformly below.
            json_ld = [json_ld]
        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    info.update({
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
                    })
                # Only the first schema.org entry is considered.
                break
        return dict((k, v) for k, v in info.items() if v is not None)
1016
1017     @staticmethod
1018     def _hidden_inputs(html):
1019         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1020         hidden_inputs = {}
1021         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1022             attrs = extract_attributes(input)
1023             if not input:
1024                 continue
1025             if attrs.get('type') not in ('hidden', 'submit'):
1026                 continue
1027             name = attrs.get('name') or attrs.get('id')
1028             value = attrs.get('value')
1029             if name and value is not None:
1030                 hidden_inputs[name] = value
1031         return hidden_inputs
1032
1033     def _form_hidden_inputs(self, form_id, html):
1034         form = self._search_regex(
1035             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1036             html, '%s form' % form_id, group='form')
1037         return self._hidden_inputs(form)
1038
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place, worst first / best last.

        When field_preference is a list/tuple of format-dict keys, formats are
        sorted by those fields alone; otherwise a built-in heuristic key is
        used (preference, language, quality, bitrates, resolution, protocol,
        extension, ...). Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified sort fields; missing values sort lowest
                # ('' for format_id, -1 for everything else).
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Lexicographically compared tuple; earlier entries dominate.
            # Missing values are mapped to -1 ('' for format_id) so they
            # sort below any real value.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1114
1115     def _check_formats(self, formats, video_id):
1116         if formats:
1117             formats[:] = filter(
1118                 lambda f: self._is_valid_url(
1119                     f['url'], video_id,
1120                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1121                 formats)
1122
1123     @staticmethod
1124     def _remove_duplicate_formats(formats):
1125         format_urls = set()
1126         unique_formats = []
1127         for f in formats:
1128             if f['url'] not in format_urls:
1129                 format_urls.add(f['url'])
1130                 unique_formats.append(f)
1131         formats[:] = unique_formats
1132
1133     def _is_valid_url(self, url, video_id, item='video', headers={}):
1134         url = self._proto_relative_url(url, scheme='http:')
1135         # For now assume non HTTP(S) URLs always valid
1136         if not (url.startswith('http://') or url.startswith('https://')):
1137             return True
1138         try:
1139             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1140             return True
1141         except ExtractorError as e:
1142             if isinstance(e.cause, compat_urllib_error.URLError):
1143                 self.to_screen(
1144                     '%s: %s URL is invalid, skipping' % (video_id, item))
1145                 return False
1146             raise
1147
1148     def http_scheme(self):
1149         """ Either "http:" or "https:", depending on the user's preferences """
1150         return (
1151             'http:'
1152             if self._downloader.params.get('prefer_insecure', False)
1153             else 'https:')
1154
1155     def _proto_relative_url(self, url, scheme=None):
1156         if url is None:
1157             return url
1158         if url.startswith('//'):
1159             if scheme is None:
1160                 scheme = self.http_scheme()
1161             return scheme + url
1162         else:
1163             return url
1164
1165     def _sleep(self, timeout, video_id, msg_template=None):
1166         if msg_template is None:
1167             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1168         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1169         self.to_screen(msg)
1170         time.sleep(timeout)
1171
1172     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1173                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1174                              fatal=True, m3u8_id=None):
1175         manifest = self._download_xml(
1176             manifest_url, video_id, 'Downloading f4m manifest',
1177             'Unable to download f4m manifest',
1178             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1179             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1180             transform_source=transform_source,
1181             fatal=fatal)
1182
1183         if manifest is False:
1184             return []
1185
1186         return self._parse_f4m_formats(
1187             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1188             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1189
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Turn a parsed f4m manifest (ElementTree element) into a formats list.

        Handles both f4m 1.0 and 2.0 namespaces, skips DRM-protected and
        Akamai player-verification manifests, and recurses into referenced
        f4m/m3u8 sub-manifests when the manifest is set-level.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        # NOTE: this local deliberately shadows the `base_url` helper
        # imported from ..utils within this method.
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # Audio-only manifests are detected via their MIME type.
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # Stream-level manifests (with bootstrap info) are flv streams.
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1289
1290     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1291         return {
1292             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1293             'url': m3u8_url,
1294             'ext': ext,
1295             'protocol': 'm3u8',
1296             'preference': preference - 100 if preference else -100,
1297             'resolution': 'multiple',
1298             'format_note': 'Quality selection URL',
1299         }
1300
1301     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1302                               entry_protocol='m3u8', preference=None,
1303                               m3u8_id=None, note=None, errnote=None,
1304                               fatal=True, live=False):
1305
1306         res = self._download_webpage_handle(
1307             m3u8_url, video_id,
1308             note=note or 'Downloading m3u8 information',
1309             errnote=errnote or 'Failed to download m3u8 information',
1310             fatal=fatal)
1311         if res is False:
1312             return []
1313         m3u8_doc, urlh = res
1314         m3u8_url = urlh.geturl()
1315
1316         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1317             return []
1318
1319         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1320
1321         format_url = lambda u: (
1322             u
1323             if re.match(r'^https?://', u)
1324             else compat_urlparse.urljoin(m3u8_url, u))
1325
1326         # We should try extracting formats only from master playlists [1], i.e.
1327         # playlists that describe available qualities. On the other hand media
1328         # playlists [2] should be returned as is since they contain just the media
1329         # without qualities renditions.
1330         # Fortunately, master playlist can be easily distinguished from media
1331         # playlist based on particular tags availability. As of [1, 2] master
1332         # playlist tags MUST NOT appear in a media playist and vice versa.
1333         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1334         # and MUST NOT appear in master playlist thus we can clearly detect media
1335         # playlist with this criterion.
1336         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1337         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1338         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1339         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1340             return [{
1341                 'url': m3u8_url,
1342                 'format_id': m3u8_id,
1343                 'ext': ext,
1344                 'protocol': entry_protocol,
1345                 'preference': preference,
1346             }]
1347         audio_in_video_stream = {}
1348         last_info = {}
1349         last_media = {}
1350         for line in m3u8_doc.splitlines():
1351             if line.startswith('#EXT-X-STREAM-INF:'):
1352                 last_info = parse_m3u8_attributes(line)
1353             elif line.startswith('#EXT-X-MEDIA:'):
1354                 media = parse_m3u8_attributes(line)
1355                 media_type = media.get('TYPE')
1356                 if media_type in ('VIDEO', 'AUDIO'):
1357                     group_id = media.get('GROUP-ID')
1358                     media_url = media.get('URI')
1359                     if media_url:
1360                         format_id = []
1361                         for v in (group_id, media.get('NAME')):
1362                             if v:
1363                                 format_id.append(v)
1364                         f = {
1365                             'format_id': '-'.join(format_id),
1366                             'url': format_url(media_url),
1367                             'language': media.get('LANGUAGE'),
1368                             'ext': ext,
1369                             'protocol': entry_protocol,
1370                             'preference': preference,
1371                         }
1372                         if media_type == 'AUDIO':
1373                             f['vcodec'] = 'none'
1374                             if group_id and not audio_in_video_stream.get(group_id):
1375                                 audio_in_video_stream[group_id] = False
1376                         formats.append(f)
1377                     else:
1378                         # When there is no URI in EXT-X-MEDIA let this tag's
1379                         # data be used by regular URI lines below
1380                         last_media = media
1381                         if media_type == 'AUDIO' and group_id:
1382                             audio_in_video_stream[group_id] = True
1383             elif line.startswith('#') or not line.strip():
1384                 continue
1385             else:
1386                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1387                 format_id = []
1388                 if m3u8_id:
1389                     format_id.append(m3u8_id)
1390                 # Despite specification does not mention NAME attribute for
1391                 # EXT-X-STREAM-INF it still sometimes may be present
1392                 stream_name = last_info.get('NAME') or last_media.get('NAME')
1393                 # Bandwidth of live streams may differ over time thus making
1394                 # format_id unpredictable. So it's better to keep provided
1395                 # format_id intact.
1396                 if not live:
1397                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1398                 manifest_url = format_url(line.strip())
1399                 f = {
1400                     'format_id': '-'.join(format_id),
1401                     'url': manifest_url,
1402                     'manifest_url': manifest_url,
1403                     'tbr': tbr,
1404                     'ext': ext,
1405                     'fps': float_or_none(last_info.get('FRAME-RATE')),
1406                     'protocol': entry_protocol,
1407                     'preference': preference,
1408                 }
1409                 resolution = last_info.get('RESOLUTION')
1410                 if resolution:
1411                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1412                     if mobj:
1413                         f['width'] = int(mobj.group('width'))
1414                         f['height'] = int(mobj.group('height'))
1415                 # Unified Streaming Platform
1416                 mobj = re.search(
1417                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1418                 if mobj:
1419                     abr, vbr = mobj.groups()
1420                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1421                     f.update({
1422                         'vbr': vbr,
1423                         'abr': abr,
1424                     })
1425                 f.update(parse_codecs(last_info.get('CODECS')))
1426                 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1427                     # TODO: update acodec for audio only formats with the same GROUP-ID
1428                     f['acodec'] = 'none'
1429                 formats.append(f)
1430                 last_info = {}
1431                 last_media = {}
1432         return formats
1433
1434     @staticmethod
1435     def _xpath_ns(path, namespace=None):
1436         if not namespace:
1437             return path
1438         out = []
1439         for c in path.split('/'):
1440             if not c or c == '.':
1441                 out.append(c)
1442             else:
1443                 out.append('{%s}%s' % (namespace, c))
1444         return '/'.join(out)
1445
1446     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1447         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1448
1449         if smil is False:
1450             assert not fatal
1451             return []
1452
1453         namespace = self._parse_smil_namespace(smil)
1454
1455         return self._parse_smil_formats(
1456             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1457
1458     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1459         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1460         if smil is False:
1461             return {}
1462         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1463
1464     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1465         return self._download_xml(
1466             smil_url, video_id, 'Downloading SMIL file',
1467             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1468
1469     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1470         namespace = self._parse_smil_namespace(smil)
1471
1472         formats = self._parse_smil_formats(
1473             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1474         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1475
1476         video_id = os.path.splitext(url_basename(smil_url))[0]
1477         title = None
1478         description = None
1479         upload_date = None
1480         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1481             name = meta.attrib.get('name')
1482             content = meta.attrib.get('content')
1483             if not name or not content:
1484                 continue
1485             if not title and name == 'title':
1486                 title = content
1487             elif not description and name in ('description', 'abstract'):
1488                 description = content
1489             elif not upload_date and name == 'date':
1490                 upload_date = unified_strdate(content)
1491
1492         thumbnails = [{
1493             'id': image.get('type'),
1494             'url': image.get('src'),
1495             'width': int_or_none(image.get('width')),
1496             'height': int_or_none(image.get('height')),
1497         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1498
1499         return {
1500             'id': video_id,
1501             'title': title or video_id,
1502             'description': description,
1503             'upload_date': upload_date,
1504             'thumbnails': thumbnails,
1505             'formats': formats,
1506             'subtitles': subtitles,
1507         }
1508
    def _parse_smil_namespace(self, smil):
        # The root tag of a namespaced SMIL document looks like
        # '{http://...}smil'; extract the namespace URI (None when absent)
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1512
1513     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1514         base = smil_url
1515         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1516             b = meta.get('base') or meta.get('httpBase')
1517             if b:
1518                 base = b
1519                 break
1520
1521         formats = []
1522         rtmp_count = 0
1523         http_count = 0
1524         m3u8_count = 0
1525
1526         srcs = []
1527         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1528         for medium in media:
1529             src = medium.get('src')
1530             if not src or src in srcs:
1531                 continue
1532             srcs.append(src)
1533
1534             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1535             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1536             width = int_or_none(medium.get('width'))
1537             height = int_or_none(medium.get('height'))
1538             proto = medium.get('proto')
1539             ext = medium.get('ext')
1540             src_ext = determine_ext(src)
1541             streamer = medium.get('streamer') or base
1542
1543             if proto == 'rtmp' or streamer.startswith('rtmp'):
1544                 rtmp_count += 1
1545                 formats.append({
1546                     'url': streamer,
1547                     'play_path': src,
1548                     'ext': 'flv',
1549                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1550                     'tbr': bitrate,
1551                     'filesize': filesize,
1552                     'width': width,
1553                     'height': height,
1554                 })
1555                 if transform_rtmp_url:
1556                     streamer, src = transform_rtmp_url(streamer, src)
1557                     formats[-1].update({
1558                         'url': streamer,
1559                         'play_path': src,
1560                     })
1561                 continue
1562
1563             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1564             src_url = src_url.strip()
1565
1566             if proto == 'm3u8' or src_ext == 'm3u8':
1567                 m3u8_formats = self._extract_m3u8_formats(
1568                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1569                 if len(m3u8_formats) == 1:
1570                     m3u8_count += 1
1571                     m3u8_formats[0].update({
1572                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1573                         'tbr': bitrate,
1574                         'width': width,
1575                         'height': height,
1576                     })
1577                 formats.extend(m3u8_formats)
1578                 continue
1579
1580             if src_ext == 'f4m':
1581                 f4m_url = src_url
1582                 if not f4m_params:
1583                     f4m_params = {
1584                         'hdcore': '3.2.0',
1585                         'plugin': 'flowplayer-3.2.0.1',
1586                     }
1587                 f4m_url += '&' if '?' in f4m_url else '?'
1588                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1589                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1590                 continue
1591
1592             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1593                 http_count += 1
1594                 formats.append({
1595                     'url': src_url,
1596                     'ext': ext or src_ext or 'flv',
1597                     'format_id': 'http-%d' % (bitrate or http_count),
1598                     'tbr': bitrate,
1599                     'filesize': filesize,
1600                     'width': width,
1601                     'height': height,
1602                 })
1603                 continue
1604
1605         return formats
1606
1607     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1608         urls = []
1609         subtitles = {}
1610         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1611             src = textstream.get('src')
1612             if not src or src in urls:
1613                 continue
1614             urls.append(src)
1615             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1616             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1617             subtitles.setdefault(lang, []).append({
1618                 'url': src,
1619                 'ext': ext,
1620             })
1621         return subtitles
1622
1623     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1624         xspf = self._download_xml(
1625             playlist_url, playlist_id, 'Downloading xpsf playlist',
1626             'Unable to download xspf manifest', fatal=fatal)
1627         if xspf is False:
1628             return []
1629         return self._parse_xspf(xspf, playlist_id)
1630
1631     def _parse_xspf(self, playlist, playlist_id):
1632         NS_MAP = {
1633             'xspf': 'http://xspf.org/ns/0/',
1634             's1': 'http://static.streamone.nl/player/ns/0',
1635         }
1636
1637         entries = []
1638         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1639             title = xpath_text(
1640                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1641             description = xpath_text(
1642                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1643             thumbnail = xpath_text(
1644                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1645             duration = float_or_none(
1646                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1647
1648             formats = [{
1649                 'url': location.text,
1650                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1651                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1652                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1653             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1654             self._sort_formats(formats)
1655
1656             entries.append({
1657                 'id': playlist_id,
1658                 'title': title,
1659                 'description': description,
1660                 'thumbnail': thumbnail,
1661                 'duration': duration,
1662                 'formats': formats,
1663             })
1664         return entries
1665
1666     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1667         res = self._download_webpage_handle(
1668             mpd_url, video_id,
1669             note=note or 'Downloading MPD manifest',
1670             errnote=errnote or 'Failed to download MPD manifest',
1671             fatal=fatal)
1672         if res is False:
1673             return []
1674         mpd, urlh = res
1675         mpd_base_url = base_url(urlh.geturl())
1676
1677         return self._parse_mpd_formats(
1678             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1679             formats_dict=formats_dict, mpd_url=mpd_url)
1680
1681     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1682         """
1683         Parse formats from MPD manifest.
1684         References:
1685          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1686             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1687          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1688         """
1689         if mpd_doc.get('type') == 'dynamic':
1690             return []
1691
1692         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1693
1694         def _add_ns(path):
1695             return self._xpath_ns(path, namespace)
1696
1697         def is_drm_protected(element):
1698             return element.find(_add_ns('ContentProtection')) is not None
1699
        def extract_multisegment_info(element, ms_parent_info):
            # Build segment info for `element`, starting from a copy of its
            # parent's info (Period -> AdaptationSet -> Representation) and
            # overriding it with the element's own SegmentList/SegmentTemplate
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements.  We will only extract relevant
            # for us.
            def extract_common(source):
                # SegmentTimeline enumerates explicit segment timings; each
                # S entry stands for 1 + @r segments (@r is a repeat count)
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = int(segment_duration)

            def extract_Initialization(source):
                # The Initialization child element carries the URL of the
                # initialization segment
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            # SegmentList takes precedence over SegmentTemplate
            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        # No @initialization attribute: fall back to the
                        # Initialization child element
                        extract_Initialization(segment_template)
            return ms_info
1757
1758         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1759         formats = []
1760         for period in mpd_doc.findall(_add_ns('Period')):
1761             period_duration = parse_duration(period.get('duration')) or mpd_duration
1762             period_ms_info = extract_multisegment_info(period, {
1763                 'start_number': 1,
1764                 'timescale': 1,
1765             })
1766             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1767                 if is_drm_protected(adaptation_set):
1768                     continue
1769                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1770                 for representation in adaptation_set.findall(_add_ns('Representation')):
1771                     if is_drm_protected(representation):
1772                         continue
1773                     representation_attrib = adaptation_set.attrib.copy()
1774                     representation_attrib.update(representation.attrib)
1775                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1776                     mime_type = representation_attrib['mimeType']
1777                     content_type = mime_type.split('/')[0]
1778                     if content_type == 'text':
1779                         # TODO implement WebVTT downloading
1780                         pass
1781                     elif content_type == 'video' or content_type == 'audio':
1782                         base_url = ''
1783                         for element in (representation, adaptation_set, period, mpd_doc):
1784                             base_url_e = element.find(_add_ns('BaseURL'))
1785                             if base_url_e is not None:
1786                                 base_url = base_url_e.text + base_url
1787                                 if re.match(r'^https?://', base_url):
1788                                     break
1789                         if mpd_base_url and not re.match(r'^https?://', base_url):
1790                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1791                                 mpd_base_url += '/'
1792                             base_url = mpd_base_url + base_url
1793                         representation_id = representation_attrib.get('id')
1794                         lang = representation_attrib.get('lang')
1795                         url_el = representation.find(_add_ns('BaseURL'))
1796                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1797                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1798                         f = {
1799                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1800                             'url': base_url,
1801                             'manifest_url': mpd_url,
1802                             'ext': mimetype2ext(mime_type),
1803                             'width': int_or_none(representation_attrib.get('width')),
1804                             'height': int_or_none(representation_attrib.get('height')),
1805                             'tbr': int_or_none(bandwidth, 1000),
1806                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1807                             'fps': int_or_none(representation_attrib.get('frameRate')),
1808                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1809                             'format_note': 'DASH %s' % content_type,
1810                             'filesize': filesize,
1811                         }
1812                         f.update(parse_codecs(representation_attrib.get('codecs')))
1813                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1814
1815                         def prepare_template(template_name, identifiers):
1816                             t = representation_ms_info[template_name]
1817                             t = t.replace('$RepresentationID$', representation_id)
1818                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1819                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1820                             t.replace('$$', '$')
1821                             return t
1822
1823                         # @initialization is a regular template like @media one
1824                         # so it should be handled just the same way (see
1825                         # https://github.com/rg3/youtube-dl/issues/11605)
1826                         if 'initialization' in representation_ms_info:
1827                             initialization_template = prepare_template(
1828                                 'initialization',
1829                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1830                                 # $Time$ shall not be included for @initialization thus
1831                                 # only $Bandwidth$ remains
1832                                 ('Bandwidth', ))
1833                             representation_ms_info['initialization_url'] = initialization_template % {
1834                                 'Bandwidth': bandwidth,
1835                             }
1836
1837                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1838
1839                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1840
1841                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1842                             # can't be used at the same time
1843                             if '%(Number' in media_template and 's' not in representation_ms_info:
1844                                 segment_duration = None
1845                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1846                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1847                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1848                                 representation_ms_info['fragments'] = [{
1849                                     'url': media_template % {
1850                                         'Number': segment_number,
1851                                         'Bandwidth': bandwidth,
1852                                     },
1853                                     'duration': segment_duration,
1854                                 } for segment_number in range(
1855                                     representation_ms_info['start_number'],
1856                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1857                             else:
1858                                 # $Number*$ or $Time$ in media template with S list available
1859                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1860                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1861                                 representation_ms_info['fragments'] = []
1862                                 segment_time = 0
1863                                 segment_d = None
1864                                 segment_number = representation_ms_info['start_number']
1865
1866                                 def add_segment_url():
1867                                     segment_url = media_template % {
1868                                         'Time': segment_time,
1869                                         'Bandwidth': bandwidth,
1870                                         'Number': segment_number,
1871                                     }
1872                                     representation_ms_info['fragments'].append({
1873                                         'url': segment_url,
1874                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1875                                     })
1876
1877                                 for num, s in enumerate(representation_ms_info['s']):
1878                                     segment_time = s.get('t') or segment_time
1879                                     segment_d = s['d']
1880                                     add_segment_url()
1881                                     segment_number += 1
1882                                     for r in range(s.get('r', 0)):
1883                                         segment_time += segment_d
1884                                         add_segment_url()
1885                                         segment_number += 1
1886                                     segment_time += segment_d
1887                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1888                             # No media template
1889                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1890                             # or any YouTube dashsegments video
1891                             fragments = []
1892                             segment_index = 0
1893                             timescale = representation_ms_info['timescale']
1894                             for s in representation_ms_info['s']:
1895                                 duration = float_or_none(s['d'], timescale)
1896                                 for r in range(s.get('r', 0) + 1):
1897                                     fragments.append({
1898                                         'url': representation_ms_info['segment_urls'][segment_index],
1899                                         'duration': duration,
1900                                     })
1901                                     segment_index += 1
1902                             representation_ms_info['fragments'] = fragments
1903                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1904                         # No fragments key is present in this case.
1905                         if 'fragments' in representation_ms_info:
1906                             f.update({
1907                                 'fragments': [],
1908                                 'protocol': 'http_dash_segments',
1909                             })
1910                             if 'initialization_url' in representation_ms_info:
1911                                 initialization_url = representation_ms_info['initialization_url']
1912                                 if not f.get('url'):
1913                                     f['url'] = initialization_url
1914                                 f['fragments'].append({'url': initialization_url})
1915                             f['fragments'].extend(representation_ms_info['fragments'])
1916                             for fragment in f['fragments']:
1917                                 fragment['url'] = urljoin(base_url, fragment['url'])
1918                         try:
1919                             existing_format = next(
1920                                 fo for fo in formats
1921                                 if fo['format_id'] == representation_id)
1922                         except StopIteration:
1923                             full_info = formats_dict.get(representation_id, {}).copy()
1924                             full_info.update(f)
1925                             formats.append(full_info)
1926                         else:
1927                             existing_format.update(f)
1928                     else:
1929                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1930         return formats
1931
1932     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1933         res = self._download_webpage_handle(
1934             ism_url, video_id,
1935             note=note or 'Downloading ISM manifest',
1936             errnote=errnote or 'Failed to download ISM manifest',
1937             fatal=fatal)
1938         if res is False:
1939             return []
1940         ism, urlh = res
1941
1942         return self._parse_ism_formats(
1943             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1944
1945     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1946         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1947             return []
1948
1949         duration = int(ism_doc.attrib['Duration'])
1950         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1951
1952         formats = []
1953         for stream in ism_doc.findall('StreamIndex'):
1954             stream_type = stream.get('Type')
1955             if stream_type not in ('video', 'audio'):
1956                 continue
1957             url_pattern = stream.attrib['Url']
1958             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1959             stream_name = stream.get('Name')
1960             for track in stream.findall('QualityLevel'):
1961                 fourcc = track.get('FourCC')
1962                 # TODO: add support for WVC1 and WMAP
1963                 if fourcc not in ('H264', 'AVC1', 'AACL'):
1964                     self.report_warning('%s is not a supported codec' % fourcc)
1965                     continue
1966                 tbr = int(track.attrib['Bitrate']) // 1000
1967                 width = int_or_none(track.get('MaxWidth'))
1968                 height = int_or_none(track.get('MaxHeight'))
1969                 sampling_rate = int_or_none(track.get('SamplingRate'))
1970
1971                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1972                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1973
1974                 fragments = []
1975                 fragment_ctx = {
1976                     'time': 0,
1977                 }
1978                 stream_fragments = stream.findall('c')
1979                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1980                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1981                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1982                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1983                     if not fragment_ctx['duration']:
1984                         try:
1985                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1986                         except IndexError:
1987                             next_fragment_time = duration
1988                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1989                     for _ in range(fragment_repeat):
1990                         fragments.append({
1991                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1992                             'duration': fragment_ctx['duration'] / stream_timescale,
1993                         })
1994                         fragment_ctx['time'] += fragment_ctx['duration']
1995
1996                 format_id = []
1997                 if ism_id:
1998                     format_id.append(ism_id)
1999                 if stream_name:
2000                     format_id.append(stream_name)
2001                 format_id.append(compat_str(tbr))
2002
2003                 formats.append({
2004                     'format_id': '-'.join(format_id),
2005                     'url': ism_url,
2006                     'manifest_url': ism_url,
2007                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2008                     'width': width,
2009                     'height': height,
2010                     'tbr': tbr,
2011                     'asr': sampling_rate,
2012                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2013                     'acodec': 'none' if stream_type == 'video' else fourcc,
2014                     'protocol': 'ism',
2015                     'fragments': fragments,
2016                     '_download_params': {
2017                         'duration': duration,
2018                         'timescale': stream_timescale,
2019                         'width': width or 0,
2020                         'height': height or 0,
2021                         'fourcc': fourcc,
2022                         'codec_private_data': track.get('CodecPrivateData'),
2023                         'sampling_rate': sampling_rate,
2024                         'channels': int_or_none(track.get('Channels', 2)),
2025                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2026                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2027                     },
2028                 })
2029         return formats
2030
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract entries (formats + subtitles) from HTML5 <video>/<audio> tags.

        Scans webpage for media tags, resolves their src/<source>/<track>
        children against base_url and returns a list of media_info dicts,
        each with 'formats', 'subtitles' and 'thumbnail' keys. Tags that
        yield neither formats nor subtitles are dropped.
        """
        def absolute_url(video_url):
            # Resolve possibly-relative media URLs against the page URL.
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Derive ext/vcodec/acodec hints from a <source type="..."> value
            # such as 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"'.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type):
            # Returns (is_plain_url, formats): manifests (m3u8/mpd) expand to
            # multiple formats; anything else is a single direct URL.
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # Self-closing tags (<video .../>) have no inner content.
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                # <source> children: each may be a direct URL or a manifest.
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        # Merge codec/ext hints from the type attribute; the
                        # URL entry wins on conflicting keys.
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> children carry subtitles/captions.
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2115
2116     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2117         formats = []
2118         hdcore_sign = 'hdcore=3.7.0'
2119         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2120         hds_host = hosts.get('hds')
2121         if hds_host:
2122             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2123         if 'hdcore=' not in f4m_url:
2124             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2125         f4m_formats = self._extract_f4m_formats(
2126             f4m_url, video_id, f4m_id='hds', fatal=False)
2127         for entry in f4m_formats:
2128             entry.update({'extra_param_to_segment_url': hdcore_sign})
2129         formats.extend(f4m_formats)
2130         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2131         hls_host = hosts.get('hls')
2132         if hls_host:
2133             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2134         formats.extend(self._extract_m3u8_formats(
2135             m3u8_url, video_id, 'mp4', 'm3u8_native',
2136             m3u8_id='hls', fatal=False))
2137         return formats
2138
2139     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2140         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2141         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2142         http_base_url = 'http' + url_base
2143         formats = []
2144         if 'm3u8' not in skip_protocols:
2145             formats.extend(self._extract_m3u8_formats(
2146                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2147                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2148         if 'f4m' not in skip_protocols:
2149             formats.extend(self._extract_f4m_formats(
2150                 http_base_url + '/manifest.f4m',
2151                 video_id, f4m_id='hds', fatal=False))
2152         if 'dash' not in skip_protocols:
2153             formats.extend(self._extract_mpd_formats(
2154                 http_base_url + '/manifest.mpd',
2155                 video_id, mpd_id='dash', fatal=False))
2156         if re.search(r'(?:/smil:|\.smil)', url_base):
2157             if 'smil' not in skip_protocols:
2158                 rtmp_formats = self._extract_smil_formats(
2159                     http_base_url + '/jwplayer.smil',
2160                     video_id, fatal=False)
2161                 for rtmp_format in rtmp_formats:
2162                     rtsp_format = rtmp_format.copy()
2163                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2164                     del rtsp_format['play_path']
2165                     del rtsp_format['ext']
2166                     rtsp_format.update({
2167                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2168                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2169                         'protocol': 'rtsp',
2170                     })
2171                     formats.extend([rtmp_format, rtsp_format])
2172         else:
2173             for protocol in ('rtmp', 'rtsp'):
2174                 if protocol not in skip_protocols:
2175                     formats.append({
2176                         'url': protocol + url_base,
2177                         'format_id': protocol,
2178                         'protocol': protocol,
2179                     })
2180         return formats
2181
2182     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2183         mobj = re.search(
2184             r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2185             webpage)
2186         if mobj:
2187             try:
2188                 jwplayer_data = self._parse_json(mobj.group('options'),
2189                                                  video_id=video_id,
2190                                                  transform_source=transform_source)
2191             except ExtractorError:
2192                 pass
2193             else:
2194                 if isinstance(jwplayer_data, dict):
2195                     return jwplayer_data
2196
2197     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2198         jwplayer_data = self._find_jwplayer_data(
2199             webpage, video_id, transform_source=js_to_json)
2200         return self._parse_jwplayer_data(
2201             jwplayer_data, video_id, *args, **kwargs)
2202
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer configuration dict into an info dict.

        Normalizes the several historical jwplayer config shapes into a
        playlist of items, extracts formats and caption tracks per item, and
        returns a single info dict for one entry or a playlist result for
        several. With require_title a missing title raises (KeyError).
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            self._sort_formats(formats)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    # Only caption tracks are treated as subtitles.
                    if track.get('kind') != 'captions':
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entries.append({
                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'formats': formats,
            })
        # A single entry is returned directly, multiple become a playlist.
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2257
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer 'sources' list into format dicts.

        Each source may be an HLS/DASH/SMIL manifest (expanded via the
        corresponding extractor), an audio-only URL, an RTMP URL (split into
        url + play_path), or a plain progressive URL.
        """
        formats = []
        for source in jwplayer_sources_data:
            source_url = self._proto_relative_url(source['file'])
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            source_type = source.get('type') or ''
            # Prefer the declared MIME type; fall back to the URL extension.
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2316
2317     def _live_title(self, name):
2318         """ Generate the title for a live video """
2319         now = datetime.datetime.now()
2320         now_str = now.strftime('%Y-%m-%d %H:%M')
2321         return name + ' ' + now_str
2322
2323     def _int(self, v, name, fatal=False, **kwargs):
2324         res = int_or_none(v, **kwargs)
2325         if 'get_attr' in kwargs:
2326             print(getattr(v, kwargs['get_attr']))
2327         if res is None:
2328             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2329             if fatal:
2330                 raise ExtractorError(msg)
2331             else:
2332                 self._downloader.report_warning(msg)
2333         return res
2334
2335     def _float(self, v, name, fatal=False, **kwargs):
2336         res = float_or_none(v, **kwargs)
2337         if res is None:
2338             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2339             if fatal:
2340                 raise ExtractorError(msg)
2341             else:
2342                 self._downloader.report_warning(msg)
2343         return res
2344
2345     def _set_cookie(self, domain, name, value, expire_time=None):
2346         cookie = compat_cookiejar.Cookie(
2347             0, name, value, None, None, domain, None,
2348             None, '/', True, False, expire_time, '', None, None, None)
2349         self._downloader.cookiejar.set_cookie(cookie)
2350
2351     def _get_cookies(self, url):
2352         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2353         req = sanitized_Request(url)
2354         self._downloader.cookiejar.add_cookie_header(req)
2355         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2356
2357     def get_testcases(self, include_onlymatching=False):
2358         t = getattr(self, '_TEST', None)
2359         if t:
2360             assert not hasattr(self, '_TESTS'), \
2361                 '%s has _TEST and _TESTS' % type(self).__name__
2362             tests = [t]
2363         else:
2364             tests = getattr(self, '_TESTS', [])
2365         for t in tests:
2366             if not include_onlymatching and t.get('only_matching', False):
2367                 continue
2368             t['name'] = type(self).__name__[:-len('IE')]
2369             yield t
2370
2371     def is_suitable(self, age_limit):
2372         """ Test whether the extractor is generally suitable for the given
2373         age limit (i.e. pornographic sites are not, all others usually are) """
2374
2375         any_restricted = False
2376         for tc in self.get_testcases(include_onlymatching=False):
2377             if tc.get('playlist', []):
2378                 tc = tc['playlist'][0]
2379             is_restricted = age_restricted(
2380                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2381             if not is_restricted:
2382                 return True
2383             any_restricted = any_restricted or is_restricted
2384         return not any_restricted
2385
2386     def extract_subtitles(self, *args, **kwargs):
2387         if (self._downloader.params.get('writesubtitles', False) or
2388                 self._downloader.params.get('listsubtitles')):
2389             return self._get_subtitles(*args, **kwargs)
2390         return {}
2391
    def _get_subtitles(self, *args, **kwargs):
        # Overridden by extractors that support subtitles; expected to return
        # a dict mapping language codes to lists of subtitle info dicts.
        raise NotImplementedError('This method must be implemented by subclasses')
2394
2395     @staticmethod
2396     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2397         """ Merge subtitle items for one language. Items with duplicated URLs
2398         will be dropped. """
2399         list1_urls = set([item['url'] for item in subtitle_list1])
2400         ret = list(subtitle_list1)
2401         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2402         return ret
2403
2404     @classmethod
2405     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2406         """ Merge two subtitle dictionaries, language by language. """
2407         ret = dict(subtitle_dict1)
2408         for lang in subtitle_dict2:
2409             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2410         return ret
2411
2412     def extract_automatic_captions(self, *args, **kwargs):
2413         if (self._downloader.params.get('writeautomaticsub', False) or
2414                 self._downloader.params.get('listsubtitles')):
2415             return self._get_automatic_captions(*args, **kwargs)
2416         return {}
2417
    def _get_automatic_captions(self, *args, **kwargs):
        # Overridden by extractors that support automatic captions; expected
        # to return a dict in the same shape as _get_subtitles.
        raise NotImplementedError('This method must be implemented by subclasses')
2420
2421     def mark_watched(self, *args, **kwargs):
2422         if (self._downloader.params.get('mark_watched', False) and
2423                 (self._get_login_info()[0] is not None or
2424                     self._downloader.params.get('cookiefile') is not None)):
2425             self._mark_watched(*args, **kwargs)
2426
    def _mark_watched(self, *args, **kwargs):
        # Overridden by extractors that can report a video as watched on the
        # originating site (requires authentication; see mark_watched).
        raise NotImplementedError('This method must be implemented by subclasses')
2429
2430     def geo_verification_headers(self):
2431         headers = {}
2432         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2433         if geo_verification_proxy:
2434             headers['Ytdl-request-proxy'] = geo_verification_proxy
2435         return headers
2436
2437     def _generic_id(self, url):
2438         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2439
2440     def _generic_title(self, url):
2441         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2442
2443
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (default: 1 result), a positive number, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        query = match.group('query')
        # Empty prefix: just the first result.
        if prefix == '':
            return self._get_n_results(query, 1)
        # 'all': as many results as the extractor supports.
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        # Numeric prefix: that many results, capped at _MAX_RESULTS.
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY