[extractor/common] Do not quit _initialize_geo_bypass on empty countries
[youtube-dl] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import random
10 import re
11 import socket
12 import sys
13 import time
14 import math
15
16 from ..compat import (
17     compat_cookiejar,
18     compat_cookies,
19     compat_etree_fromstring,
20     compat_getpass,
21     compat_http_client,
22     compat_os_name,
23     compat_str,
24     compat_urllib_error,
25     compat_urllib_parse_unquote,
26     compat_urllib_parse_urlencode,
27     compat_urllib_request,
28     compat_urlparse,
29 )
30 from ..downloader.f4m import remove_encrypted_media
31 from ..utils import (
32     NO_DEFAULT,
33     age_restricted,
34     base_url,
35     bug_reports_message,
36     clean_html,
37     compiled_regex_type,
38     determine_ext,
39     error_to_compat_str,
40     ExtractorError,
41     fix_xml_ampersands,
42     float_or_none,
43     GeoRestrictedError,
44     GeoUtils,
45     int_or_none,
46     js_to_json,
47     parse_iso8601,
48     RegexNotFoundError,
49     sanitize_filename,
50     sanitized_Request,
51     unescapeHTML,
52     unified_strdate,
53     unified_timestamp,
54     url_basename,
55     xpath_element,
56     xpath_text,
57     xpath_with_ns,
58     determine_protocol,
59     parse_duration,
60     mimetype2ext,
61     update_Request,
62     update_url_query,
63     parse_m3u8_attributes,
64     extract_attributes,
65     parse_codecs,
66     urljoin,
67 )
68
69
70 class InfoExtractor(object):
71     """Information Extractor class.
72
73     Information extractors are the classes that, given a URL, extract
74     information about the video (or videos) the URL refers to. This
75     information includes the real video URL, the video title, author and
76     others. The information is stored in a dictionary which is then
77     passed to the YoutubeDL. The YoutubeDL processes this
78     information possibly downloading the video to the file system, among
79     other possible outcomes.
80
81     The type field determines the type of the result.
82     By far the most common value (and the default if _type is missing) is
83     "video", which indicates a single video.
84
85     For a video, the dictionaries must include the following fields:
86
87     id:             Video identifier.
88     title:          Video title, unescaped.
89
90     Additionally, it must contain either a formats entry or a url one:
91
92     formats:        A list of dictionaries for each format available, ordered
93                     from worst to best quality.
94
95                     Potential fields:
96                     * url        Mandatory. The URL of the video file
97                     * manifest_url
98                                  The URL of the manifest file in case of
99                                  fragmented media (DASH, hls, hds)
100                     * ext        Will be calculated from URL if missing
101                     * format     A human-readable description of the format
102                                  ("mp4 container with h264/opus").
103                                  Calculated from the format_id, width, height.
104                                  and format_note fields if missing.
105                     * format_id  A short description of the format
106                                  ("mp4_h264_opus" or "19").
107                                 Technically optional, but strongly recommended.
108                     * format_note Additional info about the format
109                                  ("3D" or "DASH video")
110                     * width      Width of the video, if known
111                     * height     Height of the video, if known
112                     * resolution Textual description of width and height
113                     * tbr        Average bitrate of audio and video in KBit/s
114                     * abr        Average audio bitrate in KBit/s
115                     * acodec     Name of the audio codec in use
116                     * asr        Audio sampling rate in Hertz
117                     * vbr        Average video bitrate in KBit/s
118                     * fps        Frame rate
119                     * vcodec     Name of the video codec in use
120                     * container  Name of the container format
121                     * filesize   The number of bytes, if known in advance
122                     * filesize_approx  An estimate for the number of bytes
123                     * player_url SWF Player URL (used for rtmpdump).
124                     * protocol   The protocol that will be used for the actual
125                                  download, lower-case.
126                                  "http", "https", "rtsp", "rtmp", "rtmpe",
127                                  "m3u8", "m3u8_native" or "http_dash_segments".
128                     * fragment_base_url
129                                  Base URL for fragments. Each fragment's path
130                                  value (if present) will be relative to
131                                  this URL.
132                     * fragments  A list of fragments of a fragmented media.
133                                  Each fragment entry must contain either an url
134                                  or a path. If an url is present it should be
135                                  considered by a client. Otherwise both path and
136                                  fragment_base_url must be present. Here is
137                                  the list of all potential fields:
138                                  * "url" - fragment's URL
139                                  * "path" - fragment's path relative to
140                                             fragment_base_url
141                                  * "duration" (optional, int or float)
142                                  * "filesize" (optional, int)
143                     * preference Order number of this format. If this field is
144                                  present and not None, the formats get sorted
145                                  by this field, regardless of all other values.
146                                  -1 for default (order by other properties),
147                                  -2 or smaller for less than default.
148                                  < -1000 to hide the format (if there is
149                                     another one which is strictly better)
150                     * language   Language code, e.g. "de" or "en-US".
151                     * language_preference  Is this in the language mentioned in
152                                  the URL?
153                                  10 if it's what the URL is about,
154                                  -1 for default (don't know),
155                                  -10 otherwise, other values reserved for now.
156                     * quality    Order number of the video quality of this
157                                  format, irrespective of the file format.
158                                  -1 for default (order by other properties),
159                                  -2 or smaller for less than default.
160                     * source_preference  Order number for this video source
161                                   (quality takes higher priority)
162                                  -1 for default (order by other properties),
163                                  -2 or smaller for less than default.
164                     * http_headers  A dictionary of additional HTTP headers
165                                  to add to the request.
166                     * stretched_ratio  If given and not 1, indicates that the
167                                  video's pixels are not square.
168                                  width : height ratio as float.
169                     * no_resume  The server does not support resuming the
170                                  (HTTP or RTMP) download. Boolean.
171
172     url:            Final video URL.
173     ext:            Video filename extension.
174     format:         The video format, defaults to ext (used for --get-format)
175     player_url:     SWF Player URL (used for rtmpdump).
176
177     The following fields are optional:
178
179     alt_title:      A secondary title of the video.
180     display_id      An alternative identifier for the video, not necessarily
181                     unique, but available before title. Typically, id is
182                     something like "4234987", title "Dancing naked mole rats",
183                     and display_id "dancing-naked-mole-rats"
184     thumbnails:     A list of dictionaries, with the following entries:
185                         * "id" (optional, string) - Thumbnail format ID
186                         * "url"
187                         * "preference" (optional, int) - quality of the image
188                         * "width" (optional, int)
189                         * "height" (optional, int)
190                         * "resolution" (optional, string "{width}x{height}",
191                                         deprecated)
192                         * "filesize" (optional, int)
193     thumbnail:      Full URL to a video thumbnail image.
194     description:    Full video description.
195     uploader:       Full name of the video uploader.
196     license:        License name the video is licensed under.
197     creator:        The creator of the video.
198     release_date:   The date (YYYYMMDD) when the video was released.
199     timestamp:      UNIX timestamp of the moment the video became available.
200     upload_date:    Video upload date (YYYYMMDD).
201                     If not explicitly set, calculated from timestamp.
202     uploader_id:    Nickname or id of the video uploader.
203     uploader_url:   Full URL to a personal webpage of the video uploader.
204     location:       Physical location where the video was filmed.
205     subtitles:      The available subtitles as a dictionary in the format
206                     {tag: subformats}. "tag" is usually a language code, and
207                     "subformats" is a list sorted from lower to higher
208                     preference, each element is a dictionary with the "ext"
209                     entry and one of:
210                         * "data": The subtitles file contents
211                         * "url": A URL pointing to the subtitles file
212                     "ext" will be calculated from URL if missing
213     automatic_captions: Like 'subtitles', used by the YoutubeIE for
214                     automatically generated captions
215     duration:       Length of the video in seconds, as an integer or float.
216     view_count:     How many users have watched the video on the platform.
217     like_count:     Number of positive ratings of the video
218     dislike_count:  Number of negative ratings of the video
219     repost_count:   Number of reposts of the video
220     average_rating: Average rating given by users, the scale used depends on the webpage
221     comment_count:  Number of comments on the video
222     comments:       A list of comments, each with one or more of the following
223                     properties (all but one of text or html optional):
224                         * "author" - human-readable name of the comment author
225                         * "author_id" - user ID of the comment author
226                         * "id" - Comment ID
227                         * "html" - Comment as HTML
228                         * "text" - Plain text of the comment
229                         * "timestamp" - UNIX timestamp of comment
230                         * "parent" - ID of the comment this one is replying to.
231                                      Set to "root" to indicate that this is a
232                                      comment to the original video.
233     age_limit:      Age restriction for the video, as an integer (years)
234     webpage_url:    The URL to the video webpage, if given to youtube-dl it
235                     should allow to get the same result again. (It will be set
236                     by YoutubeDL if it's missing)
237     categories:     A list of categories that the video falls in, for example
238                     ["Sports", "Berlin"]
239     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
240     is_live:        True, False, or None (=unknown). Whether this video is a
241                     live stream that goes on instead of a fixed-length video.
242     start_time:     Time in seconds where the reproduction should start, as
243                     specified in the URL.
244     end_time:       Time in seconds where the reproduction should end, as
245                     specified in the URL.
246
247     The following fields should only be used when the video belongs to some logical
248     chapter or section:
249
250     chapter:        Name or title of the chapter the video belongs to.
251     chapter_number: Number of the chapter the video belongs to, as an integer.
252     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
253
254     The following fields should only be used when the video is an episode of some
255     series, programme or podcast:
256
257     series:         Title of the series or programme the video episode belongs to.
258     season:         Title of the season the video episode belongs to.
259     season_number:  Number of the season the video episode belongs to, as an integer.
260     season_id:      Id of the season the video episode belongs to, as a unicode string.
261     episode:        Title of the video episode. Unlike mandatory video title field,
262                     this field should denote the exact title of the video episode
263                     without any kind of decoration.
264     episode_number: Number of the video episode within a season, as an integer.
265     episode_id:     Id of the video episode, as a unicode string.
266
267     The following fields should only be used when the media is a track or a part of
268     a music album:
269
270     track:          Title of the track.
271     track_number:   Number of the track within an album or a disc, as an integer.
272     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
273                     as a unicode string.
274     artist:         Artist(s) of the track.
275     genre:          Genre(s) of the track.
276     album:          Title of the album the track belongs to.
277     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
278     album_artist:   List of all artists appeared on the album (e.g.
279                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
280                     and compilations).
281     disc_number:    Number of the disc or other physical medium the track belongs to,
282                     as an integer.
283     release_year:   Year (YYYY) when the album was released.
284
285     Unless mentioned otherwise, the fields should be Unicode strings.
286
287     Unless mentioned otherwise, None is equivalent to absence of information.
288
289
290     _type "playlist" indicates multiple videos.
291     There must be a key "entries", which is a list, an iterable, or a PagedList
292     object, each element of which is a valid dictionary by this specification.
293
294     Additionally, playlists can have "title", "description" and "id" attributes
295     with the same semantics as videos (see above).
296
297
298     _type "multi_video" indicates that there are multiple videos that
299     form a single show, for example multiple acts of an opera or TV episode.
300     It must have an entries key like a playlist and contain all the keys
301     required for a video at the same time.
302
303
304     _type "url" indicates that the video must be extracted from another
305     location, possibly by a different extractor. Its only required key is:
306     "url" - the next URL to extract.
307     The key "ie_key" can be set to the class name (minus the trailing "IE",
308     e.g. "Youtube") if the extractor class is known in advance.
309     Additionally, the dictionary may have any properties of the resolved entity
310     known in advance, for example "title" if the title of the referred video is
311     known ahead of time.
312
313
314     _type "url_transparent" entities have the same specification as "url", but
315     indicate that the given additional information is more precise than the one
316     associated with the resolved URL.
317     This is useful when a site employs a video service that hosts the video and
318     its technical metadata, but that video service does not embed a useful
319     title, description etc.
320
321
322     Subclasses of this one should re-define the _real_initialize() and
323     _real_extract() methods and define a _VALID_URL regexp.
324     Probably, they should also be added to the list of extractors.
325
326     _GEO_BYPASS attribute may be set to False in order to disable
327     geo restriction bypass mechanisms for a particular extractor.
328     Though it won't disable explicit geo restriction bypass based on
329     country code provided with geo_bypass_country. (experimental)
330
331     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
332     countries for this extractor. One of these countries will be used by
333     geo restriction bypass mechanism right away in order to bypass
334     geo restriction, of course, if the mechanism is not disabled. (experimental)
335
336     NB: both these geo attributes are experimental and may change in future
337     or be completely removed.
338
339     Finally, the _WORKING attribute should be set to False for broken IEs
340     in order to warn the users and skip the tests.
341     """
342
343     _ready = False
344     _downloader = None
345     _x_forwarded_for_ip = None
346     _GEO_BYPASS = True
347     _GEO_COUNTRIES = None
348     _WORKING = True
349
350     def __init__(self, downloader=None):
351         """Constructor. Receives an optional downloader."""
352         self._ready = False
353         self._x_forwarded_for_ip = None
354         self.set_downloader(downloader)
355
356     @classmethod
357     def suitable(cls, url):
358         """Receives a URL and returns True if suitable for this IE."""
359
360         # This does not use has/getattr intentionally - we want to know whether
361         # we have cached the regexp for *this* class, whereas getattr would also
362         # match the superclass
363         if '_VALID_URL_RE' not in cls.__dict__:
364             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
365         return cls._VALID_URL_RE.match(url) is not None
366
367     @classmethod
368     def _match_id(cls, url):
369         if '_VALID_URL_RE' not in cls.__dict__:
370             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
371         m = cls._VALID_URL_RE.match(url)
372         assert m
373         return m.group('id')
374
375     @classmethod
376     def working(cls):
377         """Getter method for _WORKING."""
378         return cls._WORKING
379
380     def initialize(self):
381         """Initializes an instance (authentication, etc)."""
382         self._initialize_geo_bypass(self._GEO_COUNTRIES)
383         if not self._ready:
384             self._real_initialize()
385             self._ready = True
386
387     def _initialize_geo_bypass(self, countries):
388         """
389         Initialize geo restriction bypass mechanism.
390
391         This method is used to initialize geo bypass mechanism based on faking
392         X-Forwarded-For HTTP header. A random country from provided country list
393         is selected and a random IP belonging to this country is generated. This
394         IP will be passed as X-Forwarded-For HTTP header in all subsequent
395         HTTP requests.
396
397         This method will be used for initial geo bypass mechanism initialization
398         during the instance initialization with _GEO_COUNTRIES.
399
400         You may also manually call it from extractor's code if geo countries
401         information is not available beforehand (e.g. obtained during
402         extraction) or due to some another reason.
403         """
404         if not self._x_forwarded_for_ip:
405             country_code = self._downloader.params.get('geo_bypass_country', None)
406             # If there is no explicit country for geo bypass specified and
407             # the extractor is known to be geo restricted let's fake IP
408             # as X-Forwarded-For right away.
409             if (not country_code and
410                     self._GEO_BYPASS and
411                     self._downloader.params.get('geo_bypass', True) and
412                     countries):
413                 country_code = random.choice(countries)
414             if country_code:
415                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
416                 if self._downloader.params.get('verbose', False):
417                     self._downloader.to_stdout(
418                         '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
419
420     def extract(self, url):
421         """Extracts URL information and returns it in list of dicts."""
422         try:
423             for _ in range(2):
424                 try:
425                     self.initialize()
426                     ie_result = self._real_extract(url)
427                     if self._x_forwarded_for_ip:
428                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
429                     return ie_result
430                 except GeoRestrictedError as e:
431                     if self.__maybe_fake_ip_and_retry(e.countries):
432                         continue
433                     raise
434         except ExtractorError:
435             raise
436         except compat_http_client.IncompleteRead as e:
437             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
438         except (KeyError, StopIteration) as e:
439             raise ExtractorError('An extractor error has occurred.', cause=e)
440
441     def __maybe_fake_ip_and_retry(self, countries):
442         if (not self._downloader.params.get('geo_bypass_country', None) and
443                 self._GEO_BYPASS and
444                 self._downloader.params.get('geo_bypass', True) and
445                 not self._x_forwarded_for_ip and
446                 countries):
447             self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries))
448             if self._x_forwarded_for_ip:
449                 self.report_warning(
450                     'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
451                 return True
452         return False
453
454     def set_downloader(self, downloader):
455         """Sets the downloader for this IE."""
456         self._downloader = downloader
457
458     def _real_initialize(self):
459         """Real initialization process. Redefine in subclasses."""
460         pass
461
462     def _real_extract(self, url):
463         """Real extraction process. Redefine in subclasses."""
464         pass
465
466     @classmethod
467     def ie_key(cls):
468         """A string for getting the InfoExtractor with get_info_extractor"""
469         return compat_str(cls.__name__[:-2])
470
471     @property
472     def IE_NAME(self):
473         return compat_str(type(self).__name__[:-2])
474
475     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
476         """ Returns the response handle """
477         if note is None:
478             self.report_download_webpage(video_id)
479         elif note is not False:
480             if video_id is None:
481                 self.to_screen('%s' % (note,))
482             else:
483                 self.to_screen('%s: %s' % (video_id, note))
484         if isinstance(url_or_request, compat_urllib_request.Request):
485             url_or_request = update_Request(
486                 url_or_request, data=data, headers=headers, query=query)
487         else:
488             if query:
489                 url_or_request = update_url_query(url_or_request, query)
490             if data is not None or headers:
491                 url_or_request = sanitized_Request(url_or_request, data, headers)
492         try:
493             return self._downloader.urlopen(url_or_request)
494         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
495             if errnote is False:
496                 return False
497             if errnote is None:
498                 errnote = 'Unable to download webpage'
499
500             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
501             if fatal:
502                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
503             else:
504                 self._downloader.report_warning(errmsg)
505                 return False
506
507     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
508         """ Returns a tuple (page content as string, URL handle) """
509         # Strip hashes from the URL (#1038)
510         if isinstance(url_or_request, (compat_str, str)):
511             url_or_request = url_or_request.partition('#')[0]
512
513         # Some sites check X-Forwarded-For HTTP header in order to figure out
514         # the origin of the client behind proxy. This allows bypassing geo
515         # restriction by faking this header's value to IP that belongs to some
516         # geo unrestricted country. We will do so once we encounter any
517         # geo restriction error.
518         if self._x_forwarded_for_ip:
519             if 'X-Forwarded-For' not in headers:
520                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
521
522         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
523         if urlh is False:
524             assert not fatal
525             return False
526         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
527         return (content, urlh)
528
529     @staticmethod
530     def _guess_encoding_from_content(content_type, webpage_bytes):
531         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
532         if m:
533             encoding = m.group(1)
534         else:
535             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
536                           webpage_bytes[:1024])
537             if m:
538                 encoding = m.group(1).decode('ascii')
539             elif webpage_bytes.startswith(b'\xff\xfe'):
540                 encoding = 'utf-16'
541             else:
542                 encoding = 'utf-8'
543
544         return encoding
545
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the body of an already-opened response handle.

        urlh is the response handle returned by _request_webpage; prefix, if
        given, is prepended to the raw bytes before decoding (bytes). The
        decoded page is returned as a str. Honors the dump_intermediate_pages
        and write_pages downloader options, and raises ExtractorError when
        the page is a known censorship/filtering block page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            # No explicit encoding supplied: sniff it from headers/content.
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a Request object or a plain URL string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # base64 keeps binary-ish pages printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the filename under control while staying unique:
                # truncate and append an md5 of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name from the sniffer — fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect a Websense filtering block page and surface a helpful error.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Detect the Indian censorship block page likewise.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
608
609     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
610         """ Returns the data of the page as a string """
611         success = False
612         try_count = 0
613         while success is False:
614             try:
615                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
616                 success = True
617             except compat_http_client.IncompleteRead as e:
618                 try_count += 1
619                 if try_count >= tries:
620                     raise e
621                 self._sleep(timeout, video_id)
622         if res is False:
623             return res
624         else:
625             content, _ = res
626             return content
627
628     def _download_xml(self, url_or_request, video_id,
629                       note='Downloading XML', errnote='Unable to download XML',
630                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
631         """Return the xml as an xml.etree.ElementTree.Element"""
632         xml_string = self._download_webpage(
633             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
634         if xml_string is False:
635             return xml_string
636         if transform_source:
637             xml_string = transform_source(xml_string)
638         return compat_etree_fromstring(xml_string.encode('utf-8'))
639
640     def _download_json(self, url_or_request, video_id,
641                        note='Downloading JSON metadata',
642                        errnote='Unable to download JSON metadata',
643                        transform_source=None,
644                        fatal=True, encoding=None, data=None, headers={}, query={}):
645         json_string = self._download_webpage(
646             url_or_request, video_id, note, errnote, fatal=fatal,
647             encoding=encoding, data=data, headers=headers, query=query)
648         if (not fatal) and json_string is False:
649             return None
650         return self._parse_json(
651             json_string, video_id, transform_source=transform_source, fatal=fatal)
652
653     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
654         if transform_source:
655             json_string = transform_source(json_string)
656         try:
657             return json.loads(json_string)
658         except ValueError as ve:
659             errmsg = '%s: Failed to parse JSON ' % video_id
660             if fatal:
661                 raise ExtractorError(errmsg, cause=ve)
662             else:
663                 self.report_warning(errmsg + str(ve))
664
665     def report_warning(self, msg, video_id=None):
666         idstr = '' if video_id is None else '%s: ' % video_id
667         self._downloader.report_warning(
668             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
669
670     def to_screen(self, msg):
671         """Print msg to screen, prefixing it with '[ie_name]'"""
672         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
673
674     def report_extraction(self, id_or_name):
675         """Report information extraction."""
676         self.to_screen('%s: Extracting information' % id_or_name)
677
678     def report_download_webpage(self, video_id):
679         """Report webpage download."""
680         self.to_screen('%s: Downloading webpage' % video_id)
681
682     def report_age_confirmation(self):
683         """Report attempt to confirm age."""
684         self.to_screen('Confirming age')
685
686     def report_login(self):
687         """Report attempt to log in."""
688         self.to_screen('Logging in')
689
690     @staticmethod
691     def raise_login_required(msg='This video is only available for registered users'):
692         raise ExtractorError(
693             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
694             expected=True)
695
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        # Abort extraction with a geo-restriction error; `countries` optionally
        # lists the country codes from which the video is available.
        raise GeoRestrictedError(msg, countries=countries)
699
700     # Methods for following #608
701     @staticmethod
702     def url_result(url, ie=None, video_id=None, video_title=None):
703         """Returns a URL that points to a page that should be processed"""
704         # TODO: ie should be the class used for getting the info
705         video_info = {'_type': 'url',
706                       'url': url,
707                       'ie_key': ie}
708         if video_id is not None:
709             video_info['id'] = video_id
710         if video_title is not None:
711             video_info['title'] = video_title
712         return video_info
713
714     @staticmethod
715     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
716         """Returns a playlist"""
717         video_info = {'_type': 'playlist',
718                       'entries': entries}
719         if playlist_id:
720             video_info['id'] = playlist_id
721         if playlist_title:
722             video_info['title'] = playlist_title
723         if playlist_description:
724             video_info['description'] = playlist_description
725         return video_info
726
727     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
728         """
729         Perform a regex search on the given string, using a single or a list of
730         patterns returning the first matching group.
731         In case of failure return a default value or raise a WARNING or a
732         RegexNotFoundError, depending on fatal, specifying the field name.
733         """
734         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
735             mobj = re.search(pattern, string, flags)
736         else:
737             for p in pattern:
738                 mobj = re.search(p, string, flags)
739                 if mobj:
740                     break
741
742         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
743             _name = '\033[0;34m%s\033[0m' % name
744         else:
745             _name = name
746
747         if mobj:
748             if group is None:
749                 # return the first matching group
750                 return next(g for g in mobj.groups() if g is not None)
751             else:
752                 return mobj.group(group)
753         elif default is not NO_DEFAULT:
754             return default
755         elif fatal:
756             raise RegexNotFoundError('Unable to extract %s' % _name)
757         else:
758             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
759             return None
760
761     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
762         """
763         Like _search_regex, but strips HTML tags and unescapes entities.
764         """
765         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
766         if res:
767             return clean_html(res).strip()
768         else:
769             return res
770
771     def _get_netrc_login_info(self, netrc_machine=None):
772         username = None
773         password = None
774         netrc_machine = netrc_machine or self._NETRC_MACHINE
775
776         if self._downloader.params.get('usenetrc', False):
777             try:
778                 info = netrc.netrc().authenticators(netrc_machine)
779                 if info is not None:
780                     username = info[0]
781                     password = info[2]
782                 else:
783                     raise netrc.NetrcParseError(
784                         'No authenticators for %s' % netrc_machine)
785             except (IOError, netrc.NetrcParseError) as err:
786                 self._downloader.report_warning(
787                     'parsing .netrc: %s' % error_to_compat_str(err))
788
789         return username, password
790
791     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
792         """
793         Get the login info as (username, password)
794         First look for the manually specified credentials using username_option
795         and password_option as keys in params dictionary. If no such credentials
796         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
797         value.
798         If there's no info available, return (None, None)
799         """
800         if self._downloader is None:
801             return (None, None)
802
803         downloader_params = self._downloader.params
804
805         # Attempt to use provided username and password or .netrc data
806         if downloader_params.get(username_option) is not None:
807             username = downloader_params[username_option]
808             password = downloader_params[password_option]
809         else:
810             username, password = self._get_netrc_login_info(netrc_machine)
811
812         return username, password
813
814     def _get_tfa_info(self, note='two-factor verification code'):
815         """
816         Get the two-factor authentication info
817         TODO - asking the user will be required for sms/phone verify
818         currently just uses the command line option
819         If there's no info available, return None
820         """
821         if self._downloader is None:
822             return None
823         downloader_params = self._downloader.params
824
825         if downloader_params.get('twofactor') is not None:
826             return downloader_params['twofactor']
827
828         return compat_getpass('Type %s and press [Return]: ' % note)
829
830     # Helper functions for extracting OpenGraph info
831     @staticmethod
832     def _og_regexes(prop):
833         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
834         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
835                        % {'prop': re.escape(prop)})
836         template = r'<meta[^>]+?%s[^>]+?%s'
837         return [
838             template % (property_re, content_re),
839             template % (content_re, property_re),
840         ]
841
842     @staticmethod
843     def _meta_regex(prop):
844         return r'''(?isx)<meta
845                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
846                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
847
848     def _og_search_property(self, prop, html, name=None, **kargs):
849         if not isinstance(prop, (list, tuple)):
850             prop = [prop]
851         if name is None:
852             name = 'OpenGraph %s' % prop[0]
853         og_regexes = []
854         for p in prop:
855             og_regexes.extend(self._og_regexes(p))
856         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
857         if escaped is None:
858             return None
859         return unescapeHTML(escaped)
860
861     def _og_search_thumbnail(self, html, **kargs):
862         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
863
864     def _og_search_description(self, html, **kargs):
865         return self._og_search_property('description', html, fatal=False, **kargs)
866
867     def _og_search_title(self, html, **kargs):
868         return self._og_search_property('title', html, **kargs)
869
870     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
871         regexes = self._og_regexes('video') + self._og_regexes('video:url')
872         if secure:
873             regexes = self._og_regexes('video:secure_url') + regexes
874         return self._html_search_regex(regexes, html, name, **kargs)
875
876     def _og_search_url(self, html, **kargs):
877         return self._og_search_property('url', html, **kargs)
878
879     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
880         if not isinstance(name, (list, tuple)):
881             name = [name]
882         if display_name is None:
883             display_name = name[0]
884         return self._html_search_regex(
885             [self._meta_regex(n) for n in name],
886             html, display_name, fatal=fatal, group='content', **kwargs)
887
888     def _dc_search_uploader(self, html):
889         return self._html_search_meta('dc.creator', html, 'uploader')
890
891     def _rta_search(self, html):
892         # See http://www.rtalabel.org/index.php?content=howtofaq#single
893         if re.search(r'(?ix)<meta\s+name="rating"\s+'
894                      r'     content="RTA-5042-1996-1400-1577-RTA"',
895                      html):
896             return 18
897         return 0
898
899     def _media_rating_search(self, html):
900         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
901         rating = self._html_search_meta('rating', html)
902
903         if not rating:
904             return None
905
906         RATING_TABLE = {
907             'safe for kids': 0,
908             'general': 8,
909             '14 years': 14,
910             'mature': 17,
911             'restricted': 19,
912         }
913         return RATING_TABLE.get(rating.lower())
914
915     def _family_friendly_search(self, html):
916         # See http://schema.org/VideoObject
917         family_friendly = self._html_search_meta('isFamilyFriendly', html)
918
919         if not family_friendly:
920             return None
921
922         RATING_TABLE = {
923             '1': 0,
924             'true': 0,
925             '0': 18,
926             'false': 18,
927         }
928         return RATING_TABLE.get(family_friendly.lower())
929
930     def _twitter_search_player(self, html):
931         return self._html_search_meta('twitter:player', html,
932                                       'twitter card player')
933
934     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
935         json_ld = self._search_regex(
936             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
937             html, 'JSON-LD', group='json_ld', **kwargs)
938         default = kwargs.get('default', NO_DEFAULT)
939         if not json_ld:
940             return default if default is not NO_DEFAULT else {}
941         # JSON-LD may be malformed and thus `fatal` should be respected.
942         # At the same time `default` may be passed that assumes `fatal=False`
943         # for _search_regex. Let's simulate the same behavior here as well.
944         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
945         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
946
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert a JSON-LD object (or its JSON source string) into an info dict.

        Only the first entry with an http://schema.org @context contributes;
        recognized @type values are TVEpisode, Article and VideoObject.
        Fields that end up None are stripped from the result.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        # Normalize to a list so single objects and arrays share one code path
        if isinstance(json_ld, dict):
            json_ld = [json_ld]
        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                # When the caller expects a specific @type, a mismatch yields
                # whatever has been collected so far (empty at this point)
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    # Some pages use partOfTVSeries instead of partOfSeries
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    info.update({
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
                    })
                # Only the first schema.org entry is considered
                break
        # Drop fields whose source properties were missing
        return dict((k, v) for k, v in info.items() if v is not None)
995
996     @staticmethod
997     def _hidden_inputs(html):
998         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
999         hidden_inputs = {}
1000         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1001             attrs = extract_attributes(input)
1002             if not input:
1003                 continue
1004             if attrs.get('type') not in ('hidden', 'submit'):
1005                 continue
1006             name = attrs.get('name') or attrs.get('id')
1007             value = attrs.get('value')
1008             if name and value is not None:
1009                 hidden_inputs[name] = value
1010         return hidden_inputs
1011
1012     def _form_hidden_inputs(self, form_id, html):
1013         form = self._search_regex(
1014             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1015             html, '%s form' % form_id, group='form')
1016         return self._hidden_inputs(form)
1017
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        When field_preference is a list/tuple of field names, formats are
        ordered by those fields only; otherwise a built-in multi-criteria key
        (preference, language, quality, tbr, filesize, resolution, protocol,
        extension, ...) is used. Raises ExtractorError if formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-supplied ordering: missing values sort first
                # ('' for format_id, -1 for everything else)
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                # Extension ranking depends on the prefer_free_formats option
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Compared lexicographically; earlier entries dominate
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1093
1094     def _check_formats(self, formats, video_id):
1095         if formats:
1096             formats[:] = filter(
1097                 lambda f: self._is_valid_url(
1098                     f['url'], video_id,
1099                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1100                 formats)
1101
1102     @staticmethod
1103     def _remove_duplicate_formats(formats):
1104         format_urls = set()
1105         unique_formats = []
1106         for f in formats:
1107             if f['url'] not in format_urls:
1108                 format_urls.add(f['url'])
1109                 unique_formats.append(f)
1110         formats[:] = unique_formats
1111
1112     def _is_valid_url(self, url, video_id, item='video', headers={}):
1113         url = self._proto_relative_url(url, scheme='http:')
1114         # For now assume non HTTP(S) URLs always valid
1115         if not (url.startswith('http://') or url.startswith('https://')):
1116             return True
1117         try:
1118             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1119             return True
1120         except ExtractorError as e:
1121             if isinstance(e.cause, compat_urllib_error.URLError):
1122                 self.to_screen(
1123                     '%s: %s URL is invalid, skipping' % (video_id, item))
1124                 return False
1125             raise
1126
1127     def http_scheme(self):
1128         """ Either "http:" or "https:", depending on the user's preferences """
1129         return (
1130             'http:'
1131             if self._downloader.params.get('prefer_insecure', False)
1132             else 'https:')
1133
1134     def _proto_relative_url(self, url, scheme=None):
1135         if url is None:
1136             return url
1137         if url.startswith('//'):
1138             if scheme is None:
1139                 scheme = self.http_scheme()
1140             return scheme + url
1141         else:
1142             return url
1143
1144     def _sleep(self, timeout, video_id, msg_template=None):
1145         if msg_template is None:
1146             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1147         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1148         self.to_screen(msg)
1149         time.sleep(timeout)
1150
1151     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1152                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1153                              fatal=True, m3u8_id=None):
1154         manifest = self._download_xml(
1155             manifest_url, video_id, 'Downloading f4m manifest',
1156             'Unable to download f4m manifest',
1157             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1158             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1159             transform_source=transform_source,
1160             fatal=fatal)
1161
1162         if manifest is False:
1163             return []
1164
1165         return self._parse_f4m_formats(
1166             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1167             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1168
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Build a formats list from an already-parsed f4m manifest element.

        Handles f4m 1.0 and 2.0 media nodes, skips DRM-protected renditions,
        and recursively resolves referenced sub-manifests (f4m/m3u8).
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media nodes exist
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            # Audio-only manifest: mark formats as having no video codec
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Fall back to the node index when no bitrate is advertised
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # With bootstrap info present this is a playable flv stream
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1268
1269     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1270         return {
1271             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1272             'url': m3u8_url,
1273             'ext': ext,
1274             'protocol': 'm3u8',
1275             'preference': preference - 100 if preference else -100,
1276             'resolution': 'multiple',
1277             'format_note': 'Quality selection URL',
1278         }
1279
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
        """Download an HLS (m3u8) playlist and return a list of format dicts.

        A master playlist yields one format per #EXT-X-STREAM-INF variant and
        per #EXT-X-MEDIA rendition that carries its own URI, plus a low-priority
        meta format pointing at the playlist itself.  A media playlist is
        returned as a single format.  Returns [] when the download fails and
        fatal is not set, or when the stream is Adobe Flash Access protected.
        """

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        # Resolve relative URIs against the final (post-redirect) URL
        m3u8_url = urlh.geturl()

        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        # GROUP-ID -> True when that audio group's rendition is muxed into the
        # video stream (EXT-X-MEDIA without URI), False when it is a separate
        # playlist; used below to mark video-only variants with acodec 'none'
        audio_in_video_stream = {}
        # Attributes of the most recent #EXT-X-STREAM-INF / URI-less
        # #EXT-X-MEDIA tag; per spec they apply to the next URI line
        last_info = {}
        last_media = {}
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                media = parse_m3u8_attributes(line)
                media_type = media.get('TYPE')
                if media_type in ('VIDEO', 'AUDIO'):
                    group_id = media.get('GROUP-ID')
                    media_url = media.get('URI')
                    if media_url:
                        format_id = []
                        for v in (group_id, media.get('NAME')):
                            if v:
                                format_id.append(v)
                        f = {
                            'format_id': '-'.join(format_id),
                            'url': format_url(media_url),
                            'language': media.get('LANGUAGE'),
                            'ext': ext,
                            'protocol': entry_protocol,
                            'preference': preference,
                        }
                        if media_type == 'AUDIO':
                            f['vcodec'] = 'none'
                            if group_id and not audio_in_video_stream.get(group_id):
                                audio_in_video_stream[group_id] = False
                        formats.append(f)
                    else:
                        # When there is no URI in EXT-X-MEDIA let this tag's
                        # data be used by regular URI lines below
                        last_media = media
                        if media_type == 'AUDIO' and group_id:
                            audio_in_video_stream[group_id] = True
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Despite specification does not mention NAME attribute for
                # EXT-X-STREAM-INF it still sometimes may be present
                stream_name = last_info.get('NAME') or last_media.get('NAME')
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': manifest_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform encodes audio/video bitrates in
                # the URL itself; recover abr/vbr from there when present
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                f.update(parse_codecs(last_info.get('CODECS')))
                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
                    # TODO: update acodec for audio only formats with the same GROUP-ID
                    f['acodec'] = 'none'
                formats.append(f)
                last_info = {}
                last_media = {}
        return formats
1412
1413     @staticmethod
1414     def _xpath_ns(path, namespace=None):
1415         if not namespace:
1416             return path
1417         out = []
1418         for c in path.split('/'):
1419             if not c or c == '.':
1420                 out.append(c)
1421             else:
1422                 out.append('{%s}%s' % (namespace, c))
1423         return '/'.join(out)
1424
1425     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1426         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1427
1428         if smil is False:
1429             assert not fatal
1430             return []
1431
1432         namespace = self._parse_smil_namespace(smil)
1433
1434         return self._parse_smil_formats(
1435             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1436
1437     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1438         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1439         if smil is False:
1440             return {}
1441         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1442
1443     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1444         return self._download_xml(
1445             smil_url, video_id, 'Downloading SMIL file',
1446             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1447
1448     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1449         namespace = self._parse_smil_namespace(smil)
1450
1451         formats = self._parse_smil_formats(
1452             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1453         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1454
1455         video_id = os.path.splitext(url_basename(smil_url))[0]
1456         title = None
1457         description = None
1458         upload_date = None
1459         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1460             name = meta.attrib.get('name')
1461             content = meta.attrib.get('content')
1462             if not name or not content:
1463                 continue
1464             if not title and name == 'title':
1465                 title = content
1466             elif not description and name in ('description', 'abstract'):
1467                 description = content
1468             elif not upload_date and name == 'date':
1469                 upload_date = unified_strdate(content)
1470
1471         thumbnails = [{
1472             'id': image.get('type'),
1473             'url': image.get('src'),
1474             'width': int_or_none(image.get('width')),
1475             'height': int_or_none(image.get('height')),
1476         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1477
1478         return {
1479             'id': video_id,
1480             'title': title or video_id,
1481             'description': description,
1482             'upload_date': upload_date,
1483             'thumbnails': thumbnails,
1484             'formats': formats,
1485             'subtitles': subtitles,
1486         }
1487
1488     def _parse_smil_namespace(self, smil):
1489         return self._search_regex(
1490             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1491
1492     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1493         base = smil_url
1494         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1495             b = meta.get('base') or meta.get('httpBase')
1496             if b:
1497                 base = b
1498                 break
1499
1500         formats = []
1501         rtmp_count = 0
1502         http_count = 0
1503         m3u8_count = 0
1504
1505         srcs = []
1506         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1507         for medium in media:
1508             src = medium.get('src')
1509             if not src or src in srcs:
1510                 continue
1511             srcs.append(src)
1512
1513             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1514             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1515             width = int_or_none(medium.get('width'))
1516             height = int_or_none(medium.get('height'))
1517             proto = medium.get('proto')
1518             ext = medium.get('ext')
1519             src_ext = determine_ext(src)
1520             streamer = medium.get('streamer') or base
1521
1522             if proto == 'rtmp' or streamer.startswith('rtmp'):
1523                 rtmp_count += 1
1524                 formats.append({
1525                     'url': streamer,
1526                     'play_path': src,
1527                     'ext': 'flv',
1528                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1529                     'tbr': bitrate,
1530                     'filesize': filesize,
1531                     'width': width,
1532                     'height': height,
1533                 })
1534                 if transform_rtmp_url:
1535                     streamer, src = transform_rtmp_url(streamer, src)
1536                     formats[-1].update({
1537                         'url': streamer,
1538                         'play_path': src,
1539                     })
1540                 continue
1541
1542             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1543             src_url = src_url.strip()
1544
1545             if proto == 'm3u8' or src_ext == 'm3u8':
1546                 m3u8_formats = self._extract_m3u8_formats(
1547                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1548                 if len(m3u8_formats) == 1:
1549                     m3u8_count += 1
1550                     m3u8_formats[0].update({
1551                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1552                         'tbr': bitrate,
1553                         'width': width,
1554                         'height': height,
1555                     })
1556                 formats.extend(m3u8_formats)
1557                 continue
1558
1559             if src_ext == 'f4m':
1560                 f4m_url = src_url
1561                 if not f4m_params:
1562                     f4m_params = {
1563                         'hdcore': '3.2.0',
1564                         'plugin': 'flowplayer-3.2.0.1',
1565                     }
1566                 f4m_url += '&' if '?' in f4m_url else '?'
1567                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1568                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1569                 continue
1570
1571             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1572                 http_count += 1
1573                 formats.append({
1574                     'url': src_url,
1575                     'ext': ext or src_ext or 'flv',
1576                     'format_id': 'http-%d' % (bitrate or http_count),
1577                     'tbr': bitrate,
1578                     'filesize': filesize,
1579                     'width': width,
1580                     'height': height,
1581                 })
1582                 continue
1583
1584         return formats
1585
1586     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1587         urls = []
1588         subtitles = {}
1589         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1590             src = textstream.get('src')
1591             if not src or src in urls:
1592                 continue
1593             urls.append(src)
1594             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1595             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1596             subtitles.setdefault(lang, []).append({
1597                 'url': src,
1598                 'ext': ext,
1599             })
1600         return subtitles
1601
1602     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1603         xspf = self._download_xml(
1604             playlist_url, playlist_id, 'Downloading xpsf playlist',
1605             'Unable to download xspf manifest', fatal=fatal)
1606         if xspf is False:
1607             return []
1608         return self._parse_xspf(xspf, playlist_id)
1609
1610     def _parse_xspf(self, playlist, playlist_id):
1611         NS_MAP = {
1612             'xspf': 'http://xspf.org/ns/0/',
1613             's1': 'http://static.streamone.nl/player/ns/0',
1614         }
1615
1616         entries = []
1617         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1618             title = xpath_text(
1619                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1620             description = xpath_text(
1621                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1622             thumbnail = xpath_text(
1623                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1624             duration = float_or_none(
1625                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1626
1627             formats = [{
1628                 'url': location.text,
1629                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1630                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1631                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1632             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1633             self._sort_formats(formats)
1634
1635             entries.append({
1636                 'id': playlist_id,
1637                 'title': title,
1638                 'description': description,
1639                 'thumbnail': thumbnail,
1640                 'duration': duration,
1641                 'formats': formats,
1642             })
1643         return entries
1644
1645     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1646         res = self._download_webpage_handle(
1647             mpd_url, video_id,
1648             note=note or 'Downloading MPD manifest',
1649             errnote=errnote or 'Failed to download MPD manifest',
1650             fatal=fatal)
1651         if res is False:
1652             return []
1653         mpd, urlh = res
1654         mpd_base_url = base_url(urlh.geturl())
1655
1656         return self._parse_mpd_formats(
1657             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1658             formats_dict=formats_dict, mpd_url=mpd_url)
1659
1660     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1661         """
1662         Parse formats from MPD manifest.
1663         References:
1664          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1665             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1666          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1667         """
1668         if mpd_doc.get('type') == 'dynamic':
1669             return []
1670
1671         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1672
1673         def _add_ns(path):
1674             return self._xpath_ns(path, namespace)
1675
1676         def is_drm_protected(element):
1677             return element.find(_add_ns('ContentProtection')) is not None
1678
1679         def extract_multisegment_info(element, ms_parent_info):
1680             ms_info = ms_parent_info.copy()
1681
1682             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1683             # common attributes and elements.  We will only extract relevant
1684             # for us.
1685             def extract_common(source):
1686                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1687                 if segment_timeline is not None:
1688                     s_e = segment_timeline.findall(_add_ns('S'))
1689                     if s_e:
1690                         ms_info['total_number'] = 0
1691                         ms_info['s'] = []
1692                         for s in s_e:
1693                             r = int(s.get('r', 0))
1694                             ms_info['total_number'] += 1 + r
1695                             ms_info['s'].append({
1696                                 't': int(s.get('t', 0)),
1697                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1698                                 'd': int(s.attrib['d']),
1699                                 'r': r,
1700                             })
1701                 start_number = source.get('startNumber')
1702                 if start_number:
1703                     ms_info['start_number'] = int(start_number)
1704                 timescale = source.get('timescale')
1705                 if timescale:
1706                     ms_info['timescale'] = int(timescale)
1707                 segment_duration = source.get('duration')
1708                 if segment_duration:
1709                     ms_info['segment_duration'] = int(segment_duration)
1710
1711             def extract_Initialization(source):
1712                 initialization = source.find(_add_ns('Initialization'))
1713                 if initialization is not None:
1714                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1715
1716             segment_list = element.find(_add_ns('SegmentList'))
1717             if segment_list is not None:
1718                 extract_common(segment_list)
1719                 extract_Initialization(segment_list)
1720                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1721                 if segment_urls_e:
1722                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1723             else:
1724                 segment_template = element.find(_add_ns('SegmentTemplate'))
1725                 if segment_template is not None:
1726                     extract_common(segment_template)
1727                     media = segment_template.get('media')
1728                     if media:
1729                         ms_info['media'] = media
1730                     initialization = segment_template.get('initialization')
1731                     if initialization:
1732                         ms_info['initialization'] = initialization
1733                     else:
1734                         extract_Initialization(segment_template)
1735             return ms_info
1736
1737         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1738         formats = []
1739         for period in mpd_doc.findall(_add_ns('Period')):
1740             period_duration = parse_duration(period.get('duration')) or mpd_duration
1741             period_ms_info = extract_multisegment_info(period, {
1742                 'start_number': 1,
1743                 'timescale': 1,
1744             })
1745             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1746                 if is_drm_protected(adaptation_set):
1747                     continue
1748                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1749                 for representation in adaptation_set.findall(_add_ns('Representation')):
1750                     if is_drm_protected(representation):
1751                         continue
1752                     representation_attrib = adaptation_set.attrib.copy()
1753                     representation_attrib.update(representation.attrib)
1754                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1755                     mime_type = representation_attrib['mimeType']
1756                     content_type = mime_type.split('/')[0]
1757                     if content_type == 'text':
1758                         # TODO implement WebVTT downloading
1759                         pass
1760                     elif content_type == 'video' or content_type == 'audio':
1761                         base_url = ''
1762                         for element in (representation, adaptation_set, period, mpd_doc):
1763                             base_url_e = element.find(_add_ns('BaseURL'))
1764                             if base_url_e is not None:
1765                                 base_url = base_url_e.text + base_url
1766                                 if re.match(r'^https?://', base_url):
1767                                     break
1768                         if mpd_base_url and not re.match(r'^https?://', base_url):
1769                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1770                                 mpd_base_url += '/'
1771                             base_url = mpd_base_url + base_url
1772                         representation_id = representation_attrib.get('id')
1773                         lang = representation_attrib.get('lang')
1774                         url_el = representation.find(_add_ns('BaseURL'))
1775                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1776                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1777                         f = {
1778                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1779                             'url': base_url,
1780                             'manifest_url': mpd_url,
1781                             'ext': mimetype2ext(mime_type),
1782                             'width': int_or_none(representation_attrib.get('width')),
1783                             'height': int_or_none(representation_attrib.get('height')),
1784                             'tbr': int_or_none(bandwidth, 1000),
1785                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1786                             'fps': int_or_none(representation_attrib.get('frameRate')),
1787                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1788                             'format_note': 'DASH %s' % content_type,
1789                             'filesize': filesize,
1790                         }
1791                         f.update(parse_codecs(representation_attrib.get('codecs')))
1792                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1793
1794                         def prepare_template(template_name, identifiers):
1795                             t = representation_ms_info[template_name]
1796                             t = t.replace('$RepresentationID$', representation_id)
1797                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1798                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1799                             t.replace('$$', '$')
1800                             return t
1801
1802                         # @initialization is a regular template like @media one
1803                         # so it should be handled just the same way (see
1804                         # https://github.com/rg3/youtube-dl/issues/11605)
1805                         if 'initialization' in representation_ms_info:
1806                             initialization_template = prepare_template(
1807                                 'initialization',
1808                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1809                                 # $Time$ shall not be included for @initialization thus
1810                                 # only $Bandwidth$ remains
1811                                 ('Bandwidth', ))
1812                             representation_ms_info['initialization_url'] = initialization_template % {
1813                                 'Bandwidth': bandwidth,
1814                             }
1815
1816                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1817
1818                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1819
1820                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1821                             # can't be used at the same time
1822                             if '%(Number' in media_template and 's' not in representation_ms_info:
1823                                 segment_duration = None
1824                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1825                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1826                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1827                                 representation_ms_info['fragments'] = [{
1828                                     'url': media_template % {
1829                                         'Number': segment_number,
1830                                         'Bandwidth': bandwidth,
1831                                     },
1832                                     'duration': segment_duration,
1833                                 } for segment_number in range(
1834                                     representation_ms_info['start_number'],
1835                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1836                             else:
1837                                 # $Number*$ or $Time$ in media template with S list available
1838                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1839                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1840                                 representation_ms_info['fragments'] = []
1841                                 segment_time = 0
1842                                 segment_d = None
1843                                 segment_number = representation_ms_info['start_number']
1844
1845                                 def add_segment_url():
1846                                     segment_url = media_template % {
1847                                         'Time': segment_time,
1848                                         'Bandwidth': bandwidth,
1849                                         'Number': segment_number,
1850                                     }
1851                                     representation_ms_info['fragments'].append({
1852                                         'url': segment_url,
1853                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1854                                     })
1855
1856                                 for num, s in enumerate(representation_ms_info['s']):
1857                                     segment_time = s.get('t') or segment_time
1858                                     segment_d = s['d']
1859                                     add_segment_url()
1860                                     segment_number += 1
1861                                     for r in range(s.get('r', 0)):
1862                                         segment_time += segment_d
1863                                         add_segment_url()
1864                                         segment_number += 1
1865                                     segment_time += segment_d
1866                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1867                             # No media template
1868                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1869                             # or any YouTube dashsegments video
1870                             fragments = []
1871                             segment_index = 0
1872                             timescale = representation_ms_info['timescale']
1873                             for s in representation_ms_info['s']:
1874                                 duration = float_or_none(s['d'], timescale)
1875                                 for r in range(s.get('r', 0) + 1):
1876                                     fragments.append({
1877                                         'url': representation_ms_info['segment_urls'][segment_index],
1878                                         'duration': duration,
1879                                     })
1880                                     segment_index += 1
1881                             representation_ms_info['fragments'] = fragments
1882                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1883                         # No fragments key is present in this case.
1884                         if 'fragments' in representation_ms_info:
1885                             f.update({
1886                                 'fragments': [],
1887                                 'protocol': 'http_dash_segments',
1888                             })
1889                             if 'initialization_url' in representation_ms_info:
1890                                 initialization_url = representation_ms_info['initialization_url']
1891                                 if not f.get('url'):
1892                                     f['url'] = initialization_url
1893                                 f['fragments'].append({'url': initialization_url})
1894                             f['fragments'].extend(representation_ms_info['fragments'])
1895                             for fragment in f['fragments']:
1896                                 fragment['url'] = urljoin(base_url, fragment['url'])
1897                         try:
1898                             existing_format = next(
1899                                 fo for fo in formats
1900                                 if fo['format_id'] == representation_id)
1901                         except StopIteration:
1902                             full_info = formats_dict.get(representation_id, {}).copy()
1903                             full_info.update(f)
1904                             formats.append(full_info)
1905                         else:
1906                             existing_format.update(f)
1907                     else:
1908                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1909         return formats
1910
1911     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1912         res = self._download_webpage_handle(
1913             ism_url, video_id,
1914             note=note or 'Downloading ISM manifest',
1915             errnote=errnote or 'Failed to download ISM manifest',
1916             fatal=fatal)
1917         if res is False:
1918             return []
1919         ism, urlh = res
1920
1921         return self._parse_ism_formats(
1922             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1923
1924     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1925         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1926             return []
1927
1928         duration = int(ism_doc.attrib['Duration'])
1929         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1930
1931         formats = []
1932         for stream in ism_doc.findall('StreamIndex'):
1933             stream_type = stream.get('Type')
1934             if stream_type not in ('video', 'audio'):
1935                 continue
1936             url_pattern = stream.attrib['Url']
1937             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1938             stream_name = stream.get('Name')
1939             for track in stream.findall('QualityLevel'):
1940                 fourcc = track.get('FourCC')
1941                 # TODO: add support for WVC1 and WMAP
1942                 if fourcc not in ('H264', 'AVC1', 'AACL'):
1943                     self.report_warning('%s is not a supported codec' % fourcc)
1944                     continue
1945                 tbr = int(track.attrib['Bitrate']) // 1000
1946                 width = int_or_none(track.get('MaxWidth'))
1947                 height = int_or_none(track.get('MaxHeight'))
1948                 sampling_rate = int_or_none(track.get('SamplingRate'))
1949
1950                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1951                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1952
1953                 fragments = []
1954                 fragment_ctx = {
1955                     'time': 0,
1956                 }
1957                 stream_fragments = stream.findall('c')
1958                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1959                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1960                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1961                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1962                     if not fragment_ctx['duration']:
1963                         try:
1964                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1965                         except IndexError:
1966                             next_fragment_time = duration
1967                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1968                     for _ in range(fragment_repeat):
1969                         fragments.append({
1970                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1971                             'duration': fragment_ctx['duration'] / stream_timescale,
1972                         })
1973                         fragment_ctx['time'] += fragment_ctx['duration']
1974
1975                 format_id = []
1976                 if ism_id:
1977                     format_id.append(ism_id)
1978                 if stream_name:
1979                     format_id.append(stream_name)
1980                 format_id.append(compat_str(tbr))
1981
1982                 formats.append({
1983                     'format_id': '-'.join(format_id),
1984                     'url': ism_url,
1985                     'manifest_url': ism_url,
1986                     'ext': 'ismv' if stream_type == 'video' else 'isma',
1987                     'width': width,
1988                     'height': height,
1989                     'tbr': tbr,
1990                     'asr': sampling_rate,
1991                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
1992                     'acodec': 'none' if stream_type == 'video' else fourcc,
1993                     'protocol': 'ism',
1994                     'fragments': fragments,
1995                     '_download_params': {
1996                         'duration': duration,
1997                         'timescale': stream_timescale,
1998                         'width': width or 0,
1999                         'height': height or 0,
2000                         'fourcc': fourcc,
2001                         'codec_private_data': track.get('CodecPrivateData'),
2002                         'sampling_rate': sampling_rate,
2003                         'channels': int_or_none(track.get('Channels', 2)),
2004                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2005                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2006                     },
2007                 })
2008         return formats
2009
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage.

        base_url is used to resolve relative src URLs; m3u8_id/mpd_id tag
        formats extracted from HLS/DASH manifests referenced by the tags.
        Returns a list of dicts (one per tag that yielded something), each
        with 'formats', 'subtitles' and 'thumbnail' keys.
        """
        def absolute_url(video_url):
            # Resolve a possibly relative URL against the page URL.
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Derive ext/codec hints from a MIME type such as
            # 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"'.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type):
            # Turn a src attribute into (is_plain_url, formats): m3u8/mpd
            # manifests expand into several formats, anything else is a
            # single direct URL.
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # Collect self-closing tags first (they have no inner content) ...
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        # ... then open/close pairs together with their inner HTML.
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                # src attribute directly on the media tag.
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                # <source> children may carry additional URLs plus MIME hints.
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        # Merge codec/ext hints from the type attribute into
                        # the single plain-URL format.
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> children provide subtitles/captions.
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Only keep tags that actually produced formats or subtitles.
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2093
2094     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2095         formats = []
2096         hdcore_sign = 'hdcore=3.7.0'
2097         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2098         hds_host = hosts.get('hds')
2099         if hds_host:
2100             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2101         if 'hdcore=' not in f4m_url:
2102             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2103         f4m_formats = self._extract_f4m_formats(
2104             f4m_url, video_id, f4m_id='hds', fatal=False)
2105         for entry in f4m_formats:
2106             entry.update({'extra_param_to_segment_url': hdcore_sign})
2107         formats.extend(f4m_formats)
2108         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2109         hls_host = hosts.get('hls')
2110         if hls_host:
2111             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2112         formats.extend(self._extract_m3u8_formats(
2113             m3u8_url, video_id, 'mp4', 'm3u8_native',
2114             m3u8_id='hls', fatal=False))
2115         return formats
2116
2117     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2118         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2119         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2120         http_base_url = 'http' + url_base
2121         formats = []
2122         if 'm3u8' not in skip_protocols:
2123             formats.extend(self._extract_m3u8_formats(
2124                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2125                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2126         if 'f4m' not in skip_protocols:
2127             formats.extend(self._extract_f4m_formats(
2128                 http_base_url + '/manifest.f4m',
2129                 video_id, f4m_id='hds', fatal=False))
2130         if 'dash' not in skip_protocols:
2131             formats.extend(self._extract_mpd_formats(
2132                 http_base_url + '/manifest.mpd',
2133                 video_id, mpd_id='dash', fatal=False))
2134         if re.search(r'(?:/smil:|\.smil)', url_base):
2135             if 'smil' not in skip_protocols:
2136                 rtmp_formats = self._extract_smil_formats(
2137                     http_base_url + '/jwplayer.smil',
2138                     video_id, fatal=False)
2139                 for rtmp_format in rtmp_formats:
2140                     rtsp_format = rtmp_format.copy()
2141                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2142                     del rtsp_format['play_path']
2143                     del rtsp_format['ext']
2144                     rtsp_format.update({
2145                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2146                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2147                         'protocol': 'rtsp',
2148                     })
2149                     formats.extend([rtmp_format, rtsp_format])
2150         else:
2151             for protocol in ('rtmp', 'rtsp'):
2152                 if protocol not in skip_protocols:
2153                     formats.append({
2154                         'url': protocol + url_base,
2155                         'format_id': protocol,
2156                         'protocol': protocol,
2157                     })
2158         return formats
2159
2160     @staticmethod
2161     def _find_jwplayer_data(webpage):
2162         mobj = re.search(
2163             r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2164             webpage)
2165         if mobj:
2166             return mobj.group('options')
2167
2168     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2169         jwplayer_data = self._parse_json(
2170             self._find_jwplayer_data(webpage), video_id,
2171             transform_source=js_to_json)
2172         return self._parse_jwplayer_data(
2173             jwplayer_data, video_id, *args, **kwargs)
2174
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build info dict(s) from a parsed jwplayer setup/config dict.

        Normalizes several legacy jwplayer config layouts (flattened playlist,
        single playlist item, flattened sources), then extracts formats and
        captions for each playlist item. Returns a single info dict when the
        playlist has exactly one entry, otherwise a playlist result.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = []
            for source in video_data['sources']:
                source_url = self._proto_relative_url(source['file'])
                if base_url:
                    source_url = compat_urlparse.urljoin(base_url, source_url)
                source_type = source.get('type') or ''
                ext = mimetype2ext(source_type) or determine_ext(source_url)
                if source_type == 'hls' or ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
                elif ext == 'mpd':
                    formats.extend(self._extract_mpd_formats(
                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                    # Audio-only source: mark video codec as absent.
                    formats.append({
                        'url': source_url,
                        'vcodec': 'none',
                        'ext': ext,
                    })
                else:
                    height = int_or_none(source.get('height'))
                    if height is None:
                        # Often no height is provided but there is a label in
                        # format like 1080p.
                        height = int_or_none(self._search_regex(
                            r'^(\d{3,})[pP]$', source.get('label') or '',
                            'height', default=None))
                    a_format = {
                        'url': source_url,
                        'width': int_or_none(source.get('width')),
                        'height': height,
                        'ext': ext,
                    }
                    if source_url.startswith('rtmp'):
                        a_format['ext'] = 'flv'

                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                        # of jwplayer.flash.swf
                        rtmp_url_parts = re.split(
                            r'((?:mp4|mp3|flv):)', source_url, 1)
                        if len(rtmp_url_parts) == 3:
                            # Split the RTMP URL into base URL and play path.
                            rtmp_url, prefix, play_path = rtmp_url_parts
                            a_format.update({
                                'url': rtmp_url,
                                'play_path': prefix + play_path,
                            })
                        if rtmp_params:
                            a_format.update(rtmp_params)
                    formats.append(a_format)
            self._sort_formats(formats)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    # Only caption tracks are treated as subtitles.
                    if track.get('kind') != 'captions':
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entries.append({
                'id': this_video_id,
                # A missing title is fatal only when require_title is set.
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'formats': formats,
            })
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2276
2277     def _live_title(self, name):
2278         """ Generate the title for a live video """
2279         now = datetime.datetime.now()
2280         now_str = now.strftime('%Y-%m-%d %H:%M')
2281         return name + ' ' + now_str
2282
    def _int(self, v, name, fatal=False, **kwargs):
        # Parse v as an int via int_or_none (kwargs are forwarded, e.g.
        # scale/default/get_attr). On failure: raise ExtractorError when
        # fatal, otherwise warn and return None.
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): prints the attribute to stdout — looks like
            # leftover debugging; confirm before removing.
            print(getattr(v, kwargs['get_attr']))
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res
2294
2295     def _float(self, v, name, fatal=False, **kwargs):
2296         res = float_or_none(v, **kwargs)
2297         if res is None:
2298             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2299             if fatal:
2300                 raise ExtractorError(msg)
2301             else:
2302                 self._downloader.report_warning(msg)
2303         return res
2304
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Install a cookie into the downloader's shared cookiejar.
        # expire_time is a unix timestamp, or None for a session cookie.
        # Positional Cookie(...) args map to: version=0, name, value,
        # port=None, port_specified=None, domain, domain_specified=None,
        # domain_initial_dot=None, path='/', path_specified=True,
        # secure=False, expires=expire_time, discard='', comment=None,
        # comment_url=None, rest=None.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
2310
2311     def _get_cookies(self, url):
2312         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2313         req = sanitized_Request(url)
2314         self._downloader.cookiejar.add_cookie_header(req)
2315         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2316
2317     def get_testcases(self, include_onlymatching=False):
2318         t = getattr(self, '_TEST', None)
2319         if t:
2320             assert not hasattr(self, '_TESTS'), \
2321                 '%s has _TEST and _TESTS' % type(self).__name__
2322             tests = [t]
2323         else:
2324             tests = getattr(self, '_TESTS', [])
2325         for t in tests:
2326             if not include_onlymatching and t.get('only_matching', False):
2327                 continue
2328             t['name'] = type(self).__name__[:-len('IE')]
2329             yield t
2330
2331     def is_suitable(self, age_limit):
2332         """ Test whether the extractor is generally suitable for the given
2333         age limit (i.e. pornographic sites are not, all others usually are) """
2334
2335         any_restricted = False
2336         for tc in self.get_testcases(include_onlymatching=False):
2337             if tc.get('playlist', []):
2338                 tc = tc['playlist'][0]
2339             is_restricted = age_restricted(
2340                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2341             if not is_restricted:
2342                 return True
2343             any_restricted = any_restricted or is_restricted
2344         return not any_restricted
2345
2346     def extract_subtitles(self, *args, **kwargs):
2347         if (self._downloader.params.get('writesubtitles', False) or
2348                 self._downloader.params.get('listsubtitles')):
2349             return self._get_subtitles(*args, **kwargs)
2350         return {}
2351
    def _get_subtitles(self, *args, **kwargs):
        # Actual subtitle extraction; overridden by extractors that support it.
        raise NotImplementedError('This method must be implemented by subclasses')
2354
2355     @staticmethod
2356     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2357         """ Merge subtitle items for one language. Items with duplicated URLs
2358         will be dropped. """
2359         list1_urls = set([item['url'] for item in subtitle_list1])
2360         ret = list(subtitle_list1)
2361         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2362         return ret
2363
2364     @classmethod
2365     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2366         """ Merge two subtitle dictionaries, language by language. """
2367         ret = dict(subtitle_dict1)
2368         for lang in subtitle_dict2:
2369             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2370         return ret
2371
2372     def extract_automatic_captions(self, *args, **kwargs):
2373         if (self._downloader.params.get('writeautomaticsub', False) or
2374                 self._downloader.params.get('listsubtitles')):
2375             return self._get_automatic_captions(*args, **kwargs)
2376         return {}
2377
    def _get_automatic_captions(self, *args, **kwargs):
        # Actual automatic-caption extraction; overridden by supporting extractors.
        raise NotImplementedError('This method must be implemented by subclasses')
2380
2381     def mark_watched(self, *args, **kwargs):
2382         if (self._downloader.params.get('mark_watched', False) and
2383                 (self._get_login_info()[0] is not None or
2384                     self._downloader.params.get('cookiefile') is not None)):
2385             self._mark_watched(*args, **kwargs)
2386
    def _mark_watched(self, *args, **kwargs):
        # Site-specific watched-marking; overridden by supporting extractors.
        raise NotImplementedError('This method must be implemented by subclasses')
2389
2390     def geo_verification_headers(self):
2391         headers = {}
2392         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2393         if geo_verification_proxy:
2394             headers['Ytdl-request-proxy'] = geo_verification_proxy
2395         return headers
2396
2397     def _generic_id(self, url):
2398         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2399
2400     def _generic_title(self, url):
2401         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2402
2403
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (one result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """A URL is handled by this extractor iff it matches the search scheme."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare search key: fetch only the first result.
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY