[extractor/common] Print origin country for fake IP
[youtube-dl] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import random
10 import re
11 import socket
12 import sys
13 import time
14 import math
15
16 from ..compat import (
17     compat_cookiejar,
18     compat_cookies,
19     compat_etree_fromstring,
20     compat_getpass,
21     compat_http_client,
22     compat_os_name,
23     compat_str,
24     compat_urllib_error,
25     compat_urllib_parse_unquote,
26     compat_urllib_parse_urlencode,
27     compat_urllib_request,
28     compat_urlparse,
29 )
30 from ..downloader.f4m import remove_encrypted_media
31 from ..utils import (
32     NO_DEFAULT,
33     age_restricted,
34     base_url,
35     bug_reports_message,
36     clean_html,
37     compiled_regex_type,
38     determine_ext,
39     error_to_compat_str,
40     ExtractorError,
41     fix_xml_ampersands,
42     float_or_none,
43     GeoRestrictedError,
44     GeoUtils,
45     int_or_none,
46     js_to_json,
47     parse_iso8601,
48     RegexNotFoundError,
49     sanitize_filename,
50     sanitized_Request,
51     unescapeHTML,
52     unified_strdate,
53     unified_timestamp,
54     url_basename,
55     xpath_element,
56     xpath_text,
57     xpath_with_ns,
58     determine_protocol,
59     parse_duration,
60     mimetype2ext,
61     update_Request,
62     update_url_query,
63     parse_m3u8_attributes,
64     extract_attributes,
65     parse_codecs,
66     urljoin,
67 )
68
69
70 class InfoExtractor(object):
71     """Information Extractor class.
72
73     Information extractors are the classes that, given a URL, extract
74     information about the video (or videos) the URL refers to. This
75     information includes the real video URL, the video title, author and
76     others. The information is stored in a dictionary which is then
77     passed to the YoutubeDL. The YoutubeDL processes this
78     information possibly downloading the video to the file system, among
79     other possible outcomes.
80
81     The type field determines the type of the result.
82     By far the most common value (and the default if _type is missing) is
83     "video", which indicates a single video.
84
85     For a video, the dictionaries must include the following fields:
86
87     id:             Video identifier.
88     title:          Video title, unescaped.
89
90     Additionally, it must contain either a formats entry or a url one:
91
92     formats:        A list of dictionaries for each format available, ordered
93                     from worst to best quality.
94
95                     Potential fields:
96                     * url        Mandatory. The URL of the video file
97                     * manifest_url
98                                  The URL of the manifest file in case of
99                                  fragmented media (DASH, hls, hds)
100                     * ext        Will be calculated from URL if missing
101                     * format     A human-readable description of the format
102                                  ("mp4 container with h264/opus").
103                                  Calculated from the format_id, width, height.
104                                  and format_note fields if missing.
105                     * format_id  A short description of the format
106                                  ("mp4_h264_opus" or "19").
107                                 Technically optional, but strongly recommended.
108                     * format_note Additional info about the format
109                                  ("3D" or "DASH video")
110                     * width      Width of the video, if known
111                     * height     Height of the video, if known
112                     * resolution Textual description of width and height
113                     * tbr        Average bitrate of audio and video in KBit/s
114                     * abr        Average audio bitrate in KBit/s
115                     * acodec     Name of the audio codec in use
116                     * asr        Audio sampling rate in Hertz
117                     * vbr        Average video bitrate in KBit/s
118                     * fps        Frame rate
119                     * vcodec     Name of the video codec in use
120                     * container  Name of the container format
121                     * filesize   The number of bytes, if known in advance
122                     * filesize_approx  An estimate for the number of bytes
123                     * player_url SWF Player URL (used for rtmpdump).
124                     * protocol   The protocol that will be used for the actual
125                                  download, lower-case.
126                                  "http", "https", "rtsp", "rtmp", "rtmpe",
127                                  "m3u8", "m3u8_native" or "http_dash_segments".
128                     * fragment_base_url
129                                  Base URL for fragments. Each fragment's path
130                                  value (if present) will be relative to
131                                  this URL.
132                     * fragments  A list of fragments of a fragmented media.
133                                  Each fragment entry must contain either an url
134                                  or a path. If an url is present it should be
135                                  considered by a client. Otherwise both path and
136                                  fragment_base_url must be present. Here is
137                                  the list of all potential fields:
138                                  * "url" - fragment's URL
139                                  * "path" - fragment's path relative to
140                                             fragment_base_url
141                                  * "duration" (optional, int or float)
142                                  * "filesize" (optional, int)
143                     * preference Order number of this format. If this field is
144                                  present and not None, the formats get sorted
145                                  by this field, regardless of all other values.
146                                  -1 for default (order by other properties),
147                                  -2 or smaller for less than default.
148                                  < -1000 to hide the format (if there is
149                                     another one which is strictly better)
150                     * language   Language code, e.g. "de" or "en-US".
151                     * language_preference  Is this in the language mentioned in
152                                  the URL?
153                                  10 if it's what the URL is about,
154                                  -1 for default (don't know),
155                                  -10 otherwise, other values reserved for now.
156                     * quality    Order number of the video quality of this
157                                  format, irrespective of the file format.
158                                  -1 for default (order by other properties),
159                                  -2 or smaller for less than default.
160                     * source_preference  Order number for this video source
161                                   (quality takes higher priority)
162                                  -1 for default (order by other properties),
163                                  -2 or smaller for less than default.
164                     * http_headers  A dictionary of additional HTTP headers
165                                  to add to the request.
166                     * stretched_ratio  If given and not 1, indicates that the
167                                  video's pixels are not square.
168                                  width : height ratio as float.
169                     * no_resume  The server does not support resuming the
170                                  (HTTP or RTMP) download. Boolean.
171
172     url:            Final video URL.
173     ext:            Video filename extension.
174     format:         The video format, defaults to ext (used for --get-format)
175     player_url:     SWF Player URL (used for rtmpdump).
176
177     The following fields are optional:
178
179     alt_title:      A secondary title of the video.
180     display_id      An alternative identifier for the video, not necessarily
181                     unique, but available before title. Typically, id is
182                     something like "4234987", title "Dancing naked mole rats",
183                     and display_id "dancing-naked-mole-rats"
184     thumbnails:     A list of dictionaries, with the following entries:
185                         * "id" (optional, string) - Thumbnail format ID
186                         * "url"
187                         * "preference" (optional, int) - quality of the image
188                         * "width" (optional, int)
189                         * "height" (optional, int)
190                         * "resolution" (optional, string "{width}x{height}",
191                                         deprecated)
192                         * "filesize" (optional, int)
193     thumbnail:      Full URL to a video thumbnail image.
194     description:    Full video description.
195     uploader:       Full name of the video uploader.
196     license:        License name the video is licensed under.
197     creator:        The creator of the video.
198     release_date:   The date (YYYYMMDD) when the video was released.
199     timestamp:      UNIX timestamp of the moment the video became available.
200     upload_date:    Video upload date (YYYYMMDD).
201                     If not explicitly set, calculated from timestamp.
202     uploader_id:    Nickname or id of the video uploader.
203     uploader_url:   Full URL to a personal webpage of the video uploader.
204     location:       Physical location where the video was filmed.
205     subtitles:      The available subtitles as a dictionary in the format
206                     {tag: subformats}. "tag" is usually a language code, and
207                     "subformats" is a list sorted from lower to higher
208                     preference, each element is a dictionary with the "ext"
209                     entry and one of:
210                         * "data": The subtitles file contents
211                         * "url": A URL pointing to the subtitles file
212                     "ext" will be calculated from URL if missing
213     automatic_captions: Like 'subtitles', used by the YoutubeIE for
214                     automatically generated captions
215     duration:       Length of the video in seconds, as an integer or float.
216     view_count:     How many users have watched the video on the platform.
217     like_count:     Number of positive ratings of the video
218     dislike_count:  Number of negative ratings of the video
219     repost_count:   Number of reposts of the video
220     average_rating: Average rating given by users, the scale used depends on the webpage
221     comment_count:  Number of comments on the video
222     comments:       A list of comments, each with one or more of the following
223                     properties (all but one of text or html optional):
224                         * "author" - human-readable name of the comment author
225                         * "author_id" - user ID of the comment author
226                         * "id" - Comment ID
227                         * "html" - Comment as HTML
228                         * "text" - Plain text of the comment
229                         * "timestamp" - UNIX timestamp of comment
230                         * "parent" - ID of the comment this one is replying to.
231                                      Set to "root" to indicate that this is a
232                                      comment to the original video.
233     age_limit:      Age restriction for the video, as an integer (years)
234     webpage_url:    The URL to the video webpage, if given to youtube-dl it
235                     should allow to get the same result again. (It will be set
236                     by YoutubeDL if it's missing)
237     categories:     A list of categories that the video falls in, for example
238                     ["Sports", "Berlin"]
239     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
240     is_live:        True, False, or None (=unknown). Whether this video is a
241                     live stream that goes on instead of a fixed-length video.
242     start_time:     Time in seconds where the reproduction should start, as
243                     specified in the URL.
244     end_time:       Time in seconds where the reproduction should end, as
245                     specified in the URL.
246
247     The following fields should only be used when the video belongs to some logical
248     chapter or section:
249
250     chapter:        Name or title of the chapter the video belongs to.
251     chapter_number: Number of the chapter the video belongs to, as an integer.
252     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
253
254     The following fields should only be used when the video is an episode of some
255     series, programme or podcast:
256
257     series:         Title of the series or programme the video episode belongs to.
258     season:         Title of the season the video episode belongs to.
259     season_number:  Number of the season the video episode belongs to, as an integer.
260     season_id:      Id of the season the video episode belongs to, as a unicode string.
261     episode:        Title of the video episode. Unlike mandatory video title field,
262                     this field should denote the exact title of the video episode
263                     without any kind of decoration.
264     episode_number: Number of the video episode within a season, as an integer.
265     episode_id:     Id of the video episode, as a unicode string.
266
267     The following fields should only be used when the media is a track or a part of
268     a music album:
269
270     track:          Title of the track.
271     track_number:   Number of the track within an album or a disc, as an integer.
272     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
273                     as a unicode string.
274     artist:         Artist(s) of the track.
275     genre:          Genre(s) of the track.
276     album:          Title of the album the track belongs to.
277     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
278     album_artist:   List of all artists appeared on the album (e.g.
279                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
280                     and compilations).
281     disc_number:    Number of the disc or other physical medium the track belongs to,
282                     as an integer.
283     release_year:   Year (YYYY) when the album was released.
284
285     Unless mentioned otherwise, the fields should be Unicode strings.
286
287     Unless mentioned otherwise, None is equivalent to absence of information.
288
289
290     _type "playlist" indicates multiple videos.
291     There must be a key "entries", which is a list, an iterable, or a PagedList
292     object, each element of which is a valid dictionary by this specification.
293
294     Additionally, playlists can have "title", "description" and "id" attributes
295     with the same semantics as videos (see above).
296
297
298     _type "multi_video" indicates that there are multiple videos that
299     form a single show, for examples multiple acts of an opera or TV episode.
300     It must have an entries key like a playlist and contain all the keys
301     required for a video at the same time.
302
303
304     _type "url" indicates that the video must be extracted from another
305     location, possibly by a different extractor. Its only required key is:
306     "url" - the next URL to extract.
307     The key "ie_key" can be set to the class name (minus the trailing "IE",
308     e.g. "Youtube") if the extractor class is known in advance.
309     Additionally, the dictionary may have any properties of the resolved entity
310     known in advance, for example "title" if the title of the referred video is
311     known ahead of time.
312
313
314     _type "url_transparent" entities have the same specification as "url", but
315     indicate that the given additional information is more precise than the one
316     associated with the resolved URL.
317     This is useful when a site employs a video service that hosts the video and
318     its technical metadata, but that video service does not embed a useful
319     title, description etc.
320
321
322     Subclasses of this one should re-define the _real_initialize() and
323     _real_extract() methods and define a _VALID_URL regexp.
324     Probably, they should also be added to the list of extractors.
325
326     _GEO_BYPASS attribute may be set to False in order to disable
327     geo restriction bypass mechanisms for a particular extractor.
328     Though it won't disable explicit geo restriction bypass based on
329     country code provided with geo_bypass_country. (experimental)
330
331     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
332     countries for this extractor. One of these countries will be used by
333     geo restriction bypass mechanism right away in order to bypass
334     geo restriction, of course, if the mechanism is not disabled. (experimental)
335
336     NB: both these geo attributes are experimental and may change in future
337     or be completely removed.
338
339     Finally, the _WORKING attribute should be set to False for broken IEs
340     in order to warn the users and skip the tests.
341     """
342
    # True once _real_initialize() has run for this instance.
    _ready = False
    # YoutubeDL instance, set via set_downloader(); used for params and output.
    _downloader = None
    # Fake source IP sent as the X-Forwarded-For header for geo bypass.
    _x_forwarded_for_ip = None
    # Set to False in a subclass to disable the geo restriction bypass
    # machinery for that extractor (experimental).
    _GEO_BYPASS = True
    # Optional list of presumably geo-unrestricted country codes (experimental).
    _GEO_COUNTRIES = None
    # Set to False for broken extractors to warn users and skip tests.
    _WORKING = True
349
350     def __init__(self, downloader=None):
351         """Constructor. Receives an optional downloader."""
352         self._ready = False
353         self._x_forwarded_for_ip = None
354         self.set_downloader(downloader)
355
356     @classmethod
357     def suitable(cls, url):
358         """Receives a URL and returns True if suitable for this IE."""
359
360         # This does not use has/getattr intentionally - we want to know whether
361         # we have cached the regexp for *this* class, whereas getattr would also
362         # match the superclass
363         if '_VALID_URL_RE' not in cls.__dict__:
364             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
365         return cls._VALID_URL_RE.match(url) is not None
366
367     @classmethod
368     def _match_id(cls, url):
369         if '_VALID_URL_RE' not in cls.__dict__:
370             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
371         m = cls._VALID_URL_RE.match(url)
372         assert m
373         return m.group('id')
374
375     @classmethod
376     def working(cls):
377         """Getter method for _WORKING."""
378         return cls._WORKING
379
380     def initialize(self):
381         """Initializes an instance (authentication, etc)."""
382         self._initialize_geo_bypass(self._GEO_COUNTRIES)
383         if not self._ready:
384             self._real_initialize()
385             self._ready = True
386
387     def _initialize_geo_bypass(self, countries):
388         """
389         Initialize geo restriction bypass mechanism.
390
391         This method is used to initialize geo bypass mechanism based on faking
392         X-Forwarded-For HTTP header. A random country from provided country list
393         is selected and a random IP belonging to this country is generated. This
394         IP will be passed as X-Forwarded-For HTTP header in all subsequent
395         HTTP requests.
396
397         This method will be used for initial geo bypass mechanism initialization
398         during the instance initialization with _GEO_COUNTRIES.
399
400         You may also manually call it from extractor's code if geo countries
401         information is not available beforehand (e.g. obtained during
402         extraction) or due to some another reason.
403         """
404         if not self._x_forwarded_for_ip:
405             country_code = self._downloader.params.get('geo_bypass_country', None)
406             # If there is no explicit country for geo bypass specified and
407             # the extractor is known to be geo restricted let's fake IP
408             # as X-Forwarded-For right away.
409             if (not country_code and
410                     self._GEO_BYPASS and
411                     self._downloader.params.get('geo_bypass', True) and
412                     countries):
413                 country_code = random.choice(countries)
414             if country_code:
415                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
416                 if self._downloader.params.get('verbose', False):
417                     self._downloader.to_stdout(
418                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
419                         % (self._x_forwarded_for_ip, country_code.upper()))
420
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most two attempts: the original one and, after a geo
            # restriction error, a single retry with a faked client IP.
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        # Propagate the fake IP so later requests made by
                        # YoutubeDL on this result reuse the same origin.
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            # Already a user-presentable extractor error; re-raise untouched.
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
441
442     def __maybe_fake_ip_and_retry(self, countries):
443         if (not self._downloader.params.get('geo_bypass_country', None) and
444                 self._GEO_BYPASS and
445                 self._downloader.params.get('geo_bypass', True) and
446                 not self._x_forwarded_for_ip and
447                 countries):
448             country_code = random.choice(countries)
449             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
450             if self._x_forwarded_for_ip:
451                 self.report_warning(
452                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
453                     % (self._x_forwarded_for_ip, country_code.upper()))
454                 return True
455         return False
456
457     def set_downloader(self, downloader):
458         """Sets the downloader for this IE."""
459         self._downloader = downloader
460
461     def _real_initialize(self):
462         """Real initialization process. Redefine in subclasses."""
463         pass
464
465     def _real_extract(self, url):
466         """Real extraction process. Redefine in subclasses."""
467         pass
468
469     @classmethod
470     def ie_key(cls):
471         """A string for getting the InfoExtractor with get_info_extractor"""
472         return compat_str(cls.__name__[:-2])
473
474     @property
475     def IE_NAME(self):
476         return compat_str(type(self).__name__[:-2])
477
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """Open *url_or_request* and return the response handle.

        Returns False instead of raising on failure when *fatal* is false
        (or when *errnote* is False, which also suppresses the warning).
        """
        # note=None -> default "Downloading webpage" message;
        # note=False -> print nothing.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        # Fold data/headers/query into the request object or the URL.
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote=False: the caller handles the failure silently.
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                # Pass the current traceback so the original failure point
                # is preserved in the wrapped error.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
509
510     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
511         """ Returns a tuple (page content as string, URL handle) """
512         # Strip hashes from the URL (#1038)
513         if isinstance(url_or_request, (compat_str, str)):
514             url_or_request = url_or_request.partition('#')[0]
515
516         # Some sites check X-Forwarded-For HTTP header in order to figure out
517         # the origin of the client behind proxy. This allows bypassing geo
518         # restriction by faking this header's value to IP that belongs to some
519         # geo unrestricted country. We will do so once we encounter any
520         # geo restriction error.
521         if self._x_forwarded_for_ip:
522             if 'X-Forwarded-For' not in headers:
523                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
524
525         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
526         if urlh is False:
527             assert not fatal
528             return False
529         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
530         return (content, urlh)
531
532     @staticmethod
533     def _guess_encoding_from_content(content_type, webpage_bytes):
534         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
535         if m:
536             encoding = m.group(1)
537         else:
538             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
539                           webpage_bytes[:1024])
540             if m:
541                 encoding = m.group(1).decode('ascii')
542             elif webpage_bytes.startswith(b'\xff\xfe'):
543                 encoding = 'utf-16'
544             else:
545                 encoding = 'utf-8'
546
547         return encoding
548
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read a response handle and return its body decoded to text.

        Honors the 'dump_intermediate_pages' and 'write_pages' options,
        auto-detects the charset when *encoding* is not given, and raises
        ExtractorError for known censorship/filtering block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string rather than a Request object.
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # Dump as base64 so binary responses survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Shorten over-long names but keep them unique via a hash.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name; fall back to UTF-8 rather than failing.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect known block pages and fail with a helpful message.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
611
612     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
613         """ Returns the data of the page as a string """
614         success = False
615         try_count = 0
616         while success is False:
617             try:
618                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
619                 success = True
620             except compat_http_client.IncompleteRead as e:
621                 try_count += 1
622                 if try_count >= tries:
623                     raise e
624                 self._sleep(timeout, video_id)
625         if res is False:
626             return res
627         else:
628             content, _ = res
629             return content
630
631     def _download_xml(self, url_or_request, video_id,
632                       note='Downloading XML', errnote='Unable to download XML',
633                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
634         """Return the xml as an xml.etree.ElementTree.Element"""
635         xml_string = self._download_webpage(
636             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
637         if xml_string is False:
638             return xml_string
639         if transform_source:
640             xml_string = transform_source(xml_string)
641         return compat_etree_fromstring(xml_string.encode('utf-8'))
642
643     def _download_json(self, url_or_request, video_id,
644                        note='Downloading JSON metadata',
645                        errnote='Unable to download JSON metadata',
646                        transform_source=None,
647                        fatal=True, encoding=None, data=None, headers={}, query={}):
648         json_string = self._download_webpage(
649             url_or_request, video_id, note, errnote, fatal=fatal,
650             encoding=encoding, data=data, headers=headers, query=query)
651         if (not fatal) and json_string is False:
652             return None
653         return self._parse_json(
654             json_string, video_id, transform_source=transform_source, fatal=fatal)
655
656     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
657         if transform_source:
658             json_string = transform_source(json_string)
659         try:
660             return json.loads(json_string)
661         except ValueError as ve:
662             errmsg = '%s: Failed to parse JSON ' % video_id
663             if fatal:
664                 raise ExtractorError(errmsg, cause=ve)
665             else:
666                 self.report_warning(errmsg + str(ve))
667
668     def report_warning(self, msg, video_id=None):
669         idstr = '' if video_id is None else '%s: ' % video_id
670         self._downloader.report_warning(
671             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
672
673     def to_screen(self, msg):
674         """Print msg to screen, prefixing it with '[ie_name]'"""
675         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
676
677     def report_extraction(self, id_or_name):
678         """Report information extraction."""
679         self.to_screen('%s: Extracting information' % id_or_name)
680
681     def report_download_webpage(self, video_id):
682         """Report webpage download."""
683         self.to_screen('%s: Downloading webpage' % video_id)
684
685     def report_age_confirmation(self):
686         """Report attempt to confirm age."""
687         self.to_screen('Confirming age')
688
689     def report_login(self):
690         """Report attempt to log in."""
691         self.to_screen('Logging in')
692
693     @staticmethod
694     def raise_login_required(msg='This video is only available for registered users'):
695         raise ExtractorError(
696             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
697             expected=True)
698
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        # Abort extraction with a GeoRestrictedError; `countries` carries the
        # list of country codes the video is available in (used by the caller
        # to suggest --geo-bypass-country values).
        raise GeoRestrictedError(msg, countries=countries)
702
703     # Methods for following #608
704     @staticmethod
705     def url_result(url, ie=None, video_id=None, video_title=None):
706         """Returns a URL that points to a page that should be processed"""
707         # TODO: ie should be the class used for getting the info
708         video_info = {'_type': 'url',
709                       'url': url,
710                       'ie_key': ie}
711         if video_id is not None:
712             video_info['id'] = video_id
713         if video_title is not None:
714             video_info['title'] = video_title
715         return video_info
716
717     @staticmethod
718     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
719         """Returns a playlist"""
720         video_info = {'_type': 'playlist',
721                       'entries': entries}
722         if playlist_id:
723             video_info['id'] = playlist_id
724         if playlist_title:
725             video_info['title'] = playlist_title
726         if playlist_description:
727             video_info['description'] = playlist_description
728         return video_info
729
730     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
731         """
732         Perform a regex search on the given string, using a single or a list of
733         patterns returning the first matching group.
734         In case of failure return a default value or raise a WARNING or a
735         RegexNotFoundError, depending on fatal, specifying the field name.
736         """
737         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
738             mobj = re.search(pattern, string, flags)
739         else:
740             for p in pattern:
741                 mobj = re.search(p, string, flags)
742                 if mobj:
743                     break
744
745         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
746             _name = '\033[0;34m%s\033[0m' % name
747         else:
748             _name = name
749
750         if mobj:
751             if group is None:
752                 # return the first matching group
753                 return next(g for g in mobj.groups() if g is not None)
754             else:
755                 return mobj.group(group)
756         elif default is not NO_DEFAULT:
757             return default
758         elif fatal:
759             raise RegexNotFoundError('Unable to extract %s' % _name)
760         else:
761             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
762             return None
763
764     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
765         """
766         Like _search_regex, but strips HTML tags and unescapes entities.
767         """
768         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
769         if res:
770             return clean_html(res).strip()
771         else:
772             return res
773
774     def _get_netrc_login_info(self, netrc_machine=None):
775         username = None
776         password = None
777         netrc_machine = netrc_machine or self._NETRC_MACHINE
778
779         if self._downloader.params.get('usenetrc', False):
780             try:
781                 info = netrc.netrc().authenticators(netrc_machine)
782                 if info is not None:
783                     username = info[0]
784                     password = info[2]
785                 else:
786                     raise netrc.NetrcParseError(
787                         'No authenticators for %s' % netrc_machine)
788             except (IOError, netrc.NetrcParseError) as err:
789                 self._downloader.report_warning(
790                     'parsing .netrc: %s' % error_to_compat_str(err))
791
792         return username, password
793
794     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
795         """
796         Get the login info as (username, password)
797         First look for the manually specified credentials using username_option
798         and password_option as keys in params dictionary. If no such credentials
799         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
800         value.
801         If there's no info available, return (None, None)
802         """
803         if self._downloader is None:
804             return (None, None)
805
806         downloader_params = self._downloader.params
807
808         # Attempt to use provided username and password or .netrc data
809         if downloader_params.get(username_option) is not None:
810             username = downloader_params[username_option]
811             password = downloader_params[password_option]
812         else:
813             username, password = self._get_netrc_login_info(netrc_machine)
814
815         return username, password
816
817     def _get_tfa_info(self, note='two-factor verification code'):
818         """
819         Get the two-factor authentication info
820         TODO - asking the user will be required for sms/phone verify
821         currently just uses the command line option
822         If there's no info available, return None
823         """
824         if self._downloader is None:
825             return None
826         downloader_params = self._downloader.params
827
828         if downloader_params.get('twofactor') is not None:
829             return downloader_params['twofactor']
830
831         return compat_getpass('Type %s and press [Return]: ' % note)
832
833     # Helper functions for extracting OpenGraph info
834     @staticmethod
835     def _og_regexes(prop):
836         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
837         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
838                        % {'prop': re.escape(prop)})
839         template = r'<meta[^>]+?%s[^>]+?%s'
840         return [
841             template % (property_re, content_re),
842             template % (content_re, property_re),
843         ]
844
845     @staticmethod
846     def _meta_regex(prop):
847         return r'''(?isx)<meta
848                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
849                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
850
851     def _og_search_property(self, prop, html, name=None, **kargs):
852         if not isinstance(prop, (list, tuple)):
853             prop = [prop]
854         if name is None:
855             name = 'OpenGraph %s' % prop[0]
856         og_regexes = []
857         for p in prop:
858             og_regexes.extend(self._og_regexes(p))
859         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
860         if escaped is None:
861             return None
862         return unescapeHTML(escaped)
863
864     def _og_search_thumbnail(self, html, **kargs):
865         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
866
867     def _og_search_description(self, html, **kargs):
868         return self._og_search_property('description', html, fatal=False, **kargs)
869
870     def _og_search_title(self, html, **kargs):
871         return self._og_search_property('title', html, **kargs)
872
873     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
874         regexes = self._og_regexes('video') + self._og_regexes('video:url')
875         if secure:
876             regexes = self._og_regexes('video:secure_url') + regexes
877         return self._html_search_regex(regexes, html, name, **kargs)
878
879     def _og_search_url(self, html, **kargs):
880         return self._og_search_property('url', html, **kargs)
881
882     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
883         if not isinstance(name, (list, tuple)):
884             name = [name]
885         if display_name is None:
886             display_name = name[0]
887         return self._html_search_regex(
888             [self._meta_regex(n) for n in name],
889             html, display_name, fatal=fatal, group='content', **kwargs)
890
891     def _dc_search_uploader(self, html):
892         return self._html_search_meta('dc.creator', html, 'uploader')
893
894     def _rta_search(self, html):
895         # See http://www.rtalabel.org/index.php?content=howtofaq#single
896         if re.search(r'(?ix)<meta\s+name="rating"\s+'
897                      r'     content="RTA-5042-1996-1400-1577-RTA"',
898                      html):
899             return 18
900         return 0
901
902     def _media_rating_search(self, html):
903         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
904         rating = self._html_search_meta('rating', html)
905
906         if not rating:
907             return None
908
909         RATING_TABLE = {
910             'safe for kids': 0,
911             'general': 8,
912             '14 years': 14,
913             'mature': 17,
914             'restricted': 19,
915         }
916         return RATING_TABLE.get(rating.lower())
917
918     def _family_friendly_search(self, html):
919         # See http://schema.org/VideoObject
920         family_friendly = self._html_search_meta('isFamilyFriendly', html)
921
922         if not family_friendly:
923             return None
924
925         RATING_TABLE = {
926             '1': 0,
927             'true': 0,
928             '0': 18,
929             'false': 18,
930         }
931         return RATING_TABLE.get(family_friendly.lower())
932
933     def _twitter_search_player(self, html):
934         return self._html_search_meta('twitter:player', html,
935                                       'twitter card player')
936
937     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
938         json_ld = self._search_regex(
939             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
940             html, 'JSON-LD', group='json_ld', **kwargs)
941         default = kwargs.get('default', NO_DEFAULT)
942         if not json_ld:
943             return default if default is not NO_DEFAULT else {}
944         # JSON-LD may be malformed and thus `fatal` should be respected.
945         # At the same time `default` may be passed that assumes `fatal=False`
946         # for _search_regex. Let's simulate the same behavior here as well.
947         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
948         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
949
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert a JSON-LD object (or its JSON string form) into an info dict.

        Only items with @context http://schema.org are considered; TVEpisode,
        Article and VideoObject @types are mapped to info-dict fields. If
        expected_type is given and the first schema.org item has a different
        @type, an empty dict is returned. None-valued fields are dropped.
        """
        if isinstance(json_ld, compat_str):
            # Accept the raw JSON string as found in the page
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize a single object to a one-element list
            json_ld = [json_ld]
        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    # Both partOfSeries and partOfTVSeries occur in the wild
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    info.update({
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        # Both spellings of the thumbnail key occur in the wild
                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
                    })
                # Only the first schema.org item is used
                break
        return dict((k, v) for k, v in info.items() if v is not None)
998
999     @staticmethod
1000     def _hidden_inputs(html):
1001         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1002         hidden_inputs = {}
1003         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1004             attrs = extract_attributes(input)
1005             if not input:
1006                 continue
1007             if attrs.get('type') not in ('hidden', 'submit'):
1008                 continue
1009             name = attrs.get('name') or attrs.get('id')
1010             value = attrs.get('value')
1011             if name and value is not None:
1012                 hidden_inputs[name] = value
1013         return hidden_inputs
1014
    def _form_hidden_inputs(self, form_id, html):
        # Locate the <form> with the given id attribute (fatal when absent)
        # and return its hidden/submit inputs as a name -> value dict.
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
1020
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        When field_preference (a list/tuple of format-dict keys) is given,
        formats are ordered by those fields alone; otherwise a built-in
        heuristic key is used (preference, language, quality, bitrates,
        resolution, protocol, extension, ...). Raises ExtractorError when
        formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # Comparison key for one format dict; missing numeric fields
            # compare as -1 so known values always win.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering: compare only the listed fields
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # HTTP(S) > other protocols > RTSP
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple comparison: earlier fields dominate later ones
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1096
1097     def _check_formats(self, formats, video_id):
1098         if formats:
1099             formats[:] = filter(
1100                 lambda f: self._is_valid_url(
1101                     f['url'], video_id,
1102                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1103                 formats)
1104
1105     @staticmethod
1106     def _remove_duplicate_formats(formats):
1107         format_urls = set()
1108         unique_formats = []
1109         for f in formats:
1110             if f['url'] not in format_urls:
1111                 format_urls.add(f['url'])
1112                 unique_formats.append(f)
1113         formats[:] = unique_formats
1114
1115     def _is_valid_url(self, url, video_id, item='video', headers={}):
1116         url = self._proto_relative_url(url, scheme='http:')
1117         # For now assume non HTTP(S) URLs always valid
1118         if not (url.startswith('http://') or url.startswith('https://')):
1119             return True
1120         try:
1121             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1122             return True
1123         except ExtractorError as e:
1124             if isinstance(e.cause, compat_urllib_error.URLError):
1125                 self.to_screen(
1126                     '%s: %s URL is invalid, skipping' % (video_id, item))
1127                 return False
1128             raise
1129
1130     def http_scheme(self):
1131         """ Either "http:" or "https:", depending on the user's preferences """
1132         return (
1133             'http:'
1134             if self._downloader.params.get('prefer_insecure', False)
1135             else 'https:')
1136
1137     def _proto_relative_url(self, url, scheme=None):
1138         if url is None:
1139             return url
1140         if url.startswith('//'):
1141             if scheme is None:
1142                 scheme = self.http_scheme()
1143             return scheme + url
1144         else:
1145             return url
1146
1147     def _sleep(self, timeout, video_id, msg_template=None):
1148         if msg_template is None:
1149             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1150         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1151         self.to_screen(msg)
1152         time.sleep(timeout)
1153
1154     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1155                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1156                              fatal=True, m3u8_id=None):
1157         manifest = self._download_xml(
1158             manifest_url, video_id, 'Downloading f4m manifest',
1159             'Unable to download f4m manifest',
1160             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1161             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1162             transform_source=transform_source,
1163             fatal=fatal)
1164
1165         if manifest is False:
1166             return []
1167
1168         return self._parse_f4m_formats(
1169             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1170             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1171
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an already-downloaded f4m manifest Element into format dicts.

        Handles both f4m 1.0 and 2.0 namespaces, drops DRM-protected media,
        and recursively resolves set-level manifests whose media entries
        reference external f4m/m3u8 manifests.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            # Audio-only manifest: mark formats as having no video codec
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Fall back to the media index when no bitrate is advertised
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # flv is only known for sure when a bootstrapInfo is present
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1271
1272     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1273         return {
1274             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1275             'url': m3u8_url,
1276             'ext': ext,
1277             'protocol': 'm3u8',
1278             'preference': preference - 100 if preference else -100,
1279             'resolution': 'multiple',
1280             'format_note': 'Quality selection URL',
1281         }
1282
1283     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1284                               entry_protocol='m3u8', preference=None,
1285                               m3u8_id=None, note=None, errnote=None,
1286                               fatal=True, live=False):
1287
1288         res = self._download_webpage_handle(
1289             m3u8_url, video_id,
1290             note=note or 'Downloading m3u8 information',
1291             errnote=errnote or 'Failed to download m3u8 information',
1292             fatal=fatal)
1293         if res is False:
1294             return []
1295         m3u8_doc, urlh = res
1296         m3u8_url = urlh.geturl()
1297
1298         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1299             return []
1300
1301         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1302
1303         format_url = lambda u: (
1304             u
1305             if re.match(r'^https?://', u)
1306             else compat_urlparse.urljoin(m3u8_url, u))
1307
1308         # We should try extracting formats only from master playlists [1], i.e.
1309         # playlists that describe available qualities. On the other hand media
1310         # playlists [2] should be returned as is since they contain just the media
1311         # without qualities renditions.
1312         # Fortunately, master playlist can be easily distinguished from media
1313         # playlist based on particular tags availability. As of [1, 2] master
1314         # playlist tags MUST NOT appear in a media playist and vice versa.
1315         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1316         # and MUST NOT appear in master playlist thus we can clearly detect media
1317         # playlist with this criterion.
1318         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1319         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1320         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1321         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1322             return [{
1323                 'url': m3u8_url,
1324                 'format_id': m3u8_id,
1325                 'ext': ext,
1326                 'protocol': entry_protocol,
1327                 'preference': preference,
1328             }]
1329         audio_in_video_stream = {}
1330         last_info = {}
1331         last_media = {}
1332         for line in m3u8_doc.splitlines():
1333             if line.startswith('#EXT-X-STREAM-INF:'):
1334                 last_info = parse_m3u8_attributes(line)
1335             elif line.startswith('#EXT-X-MEDIA:'):
1336                 media = parse_m3u8_attributes(line)
1337                 media_type = media.get('TYPE')
1338                 if media_type in ('VIDEO', 'AUDIO'):
1339                     group_id = media.get('GROUP-ID')
1340                     media_url = media.get('URI')
1341                     if media_url:
1342                         format_id = []
1343                         for v in (group_id, media.get('NAME')):
1344                             if v:
1345                                 format_id.append(v)
1346                         f = {
1347                             'format_id': '-'.join(format_id),
1348                             'url': format_url(media_url),
1349                             'language': media.get('LANGUAGE'),
1350                             'ext': ext,
1351                             'protocol': entry_protocol,
1352                             'preference': preference,
1353                         }
1354                         if media_type == 'AUDIO':
1355                             f['vcodec'] = 'none'
1356                             if group_id and not audio_in_video_stream.get(group_id):
1357                                 audio_in_video_stream[group_id] = False
1358                         formats.append(f)
1359                     else:
1360                         # When there is no URI in EXT-X-MEDIA let this tag's
1361                         # data be used by regular URI lines below
1362                         last_media = media
1363                         if media_type == 'AUDIO' and group_id:
1364                             audio_in_video_stream[group_id] = True
1365             elif line.startswith('#') or not line.strip():
1366                 continue
1367             else:
1368                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1369                 format_id = []
1370                 if m3u8_id:
1371                     format_id.append(m3u8_id)
1372                 # Despite specification does not mention NAME attribute for
1373                 # EXT-X-STREAM-INF it still sometimes may be present
1374                 stream_name = last_info.get('NAME') or last_media.get('NAME')
1375                 # Bandwidth of live streams may differ over time thus making
1376                 # format_id unpredictable. So it's better to keep provided
1377                 # format_id intact.
1378                 if not live:
1379                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1380                 manifest_url = format_url(line.strip())
1381                 f = {
1382                     'format_id': '-'.join(format_id),
1383                     'url': manifest_url,
1384                     'manifest_url': manifest_url,
1385                     'tbr': tbr,
1386                     'ext': ext,
1387                     'fps': float_or_none(last_info.get('FRAME-RATE')),
1388                     'protocol': entry_protocol,
1389                     'preference': preference,
1390                 }
1391                 resolution = last_info.get('RESOLUTION')
1392                 if resolution:
1393                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1394                     if mobj:
1395                         f['width'] = int(mobj.group('width'))
1396                         f['height'] = int(mobj.group('height'))
1397                 # Unified Streaming Platform
1398                 mobj = re.search(
1399                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1400                 if mobj:
1401                     abr, vbr = mobj.groups()
1402                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1403                     f.update({
1404                         'vbr': vbr,
1405                         'abr': abr,
1406                     })
1407                 f.update(parse_codecs(last_info.get('CODECS')))
1408                 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1409                     # TODO: update acodec for audio only formats with the same GROUP-ID
1410                     f['acodec'] = 'none'
1411                 formats.append(f)
1412                 last_info = {}
1413                 last_media = {}
1414         return formats
1415
1416     @staticmethod
1417     def _xpath_ns(path, namespace=None):
1418         if not namespace:
1419             return path
1420         out = []
1421         for c in path.split('/'):
1422             if not c or c == '.':
1423                 out.append(c)
1424             else:
1425                 out.append('{%s}%s' % (namespace, c))
1426         return '/'.join(out)
1427
1428     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1429         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1430
1431         if smil is False:
1432             assert not fatal
1433             return []
1434
1435         namespace = self._parse_smil_namespace(smil)
1436
1437         return self._parse_smil_formats(
1438             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1439
1440     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1441         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1442         if smil is False:
1443             return {}
1444         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1445
1446     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1447         return self._download_xml(
1448             smil_url, video_id, 'Downloading SMIL file',
1449             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1450
1451     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1452         namespace = self._parse_smil_namespace(smil)
1453
1454         formats = self._parse_smil_formats(
1455             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1456         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1457
1458         video_id = os.path.splitext(url_basename(smil_url))[0]
1459         title = None
1460         description = None
1461         upload_date = None
1462         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1463             name = meta.attrib.get('name')
1464             content = meta.attrib.get('content')
1465             if not name or not content:
1466                 continue
1467             if not title and name == 'title':
1468                 title = content
1469             elif not description and name in ('description', 'abstract'):
1470                 description = content
1471             elif not upload_date and name == 'date':
1472                 upload_date = unified_strdate(content)
1473
1474         thumbnails = [{
1475             'id': image.get('type'),
1476             'url': image.get('src'),
1477             'width': int_or_none(image.get('width')),
1478             'height': int_or_none(image.get('height')),
1479         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1480
1481         return {
1482             'id': video_id,
1483             'title': title or video_id,
1484             'description': description,
1485             'upload_date': upload_date,
1486             'thumbnails': thumbnails,
1487             'formats': formats,
1488             'subtitles': subtitles,
1489         }
1490
1491     def _parse_smil_namespace(self, smil):
1492         return self._search_regex(
1493             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1494
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from a parsed SMIL document.

        Handles RTMP, HLS (m3u8), HDS (f4m) and plain HTTP media entries
        found in <video> and <audio> elements. transform_rtmp_url, when
        given, is called as (streamer, play_path) -> (streamer, play_path)
        to rewrite RTMP URLs after the initial format entry is built.
        """
        # A <meta base=...> (or httpBase) in the head overrides the manifest
        # URL as the base for relative media sources; first match wins.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # Per-protocol counters, used in format_id when no bitrate is known.
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            # Skip entries without a source and deduplicate repeated sources.
            if not src or src in srcs:
                continue
            srcs.append(src)

            # system-bitrate is in bits/s; normalize to kbit/s.
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    # Let the caller rewrite streamer/play_path in place.
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            # Resolve relative sources against the (possibly overridden) base.
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single extracted format means a media (not master)
                # playlist: enrich it with this entry's attributes.
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default hdcore parameters expected by Adobe HDS servers.
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            # NOTE(review): the validity check probes the raw src (possibly
            # relative), not the resolved src_url -- verify this is intended.
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats
1588
1589     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1590         urls = []
1591         subtitles = {}
1592         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1593             src = textstream.get('src')
1594             if not src or src in urls:
1595                 continue
1596             urls.append(src)
1597             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1598             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1599             subtitles.setdefault(lang, []).append({
1600                 'url': src,
1601                 'ext': ext,
1602             })
1603         return subtitles
1604
1605     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1606         xspf = self._download_xml(
1607             playlist_url, playlist_id, 'Downloading xpsf playlist',
1608             'Unable to download xspf manifest', fatal=fatal)
1609         if xspf is False:
1610             return []
1611         return self._parse_xspf(xspf, playlist_id)
1612
1613     def _parse_xspf(self, playlist, playlist_id):
1614         NS_MAP = {
1615             'xspf': 'http://xspf.org/ns/0/',
1616             's1': 'http://static.streamone.nl/player/ns/0',
1617         }
1618
1619         entries = []
1620         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1621             title = xpath_text(
1622                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1623             description = xpath_text(
1624                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1625             thumbnail = xpath_text(
1626                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1627             duration = float_or_none(
1628                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1629
1630             formats = [{
1631                 'url': location.text,
1632                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1633                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1634                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1635             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1636             self._sort_formats(formats)
1637
1638             entries.append({
1639                 'id': playlist_id,
1640                 'title': title,
1641                 'description': description,
1642                 'thumbnail': thumbnail,
1643                 'duration': duration,
1644                 'formats': formats,
1645             })
1646         return entries
1647
1648     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1649         res = self._download_webpage_handle(
1650             mpd_url, video_id,
1651             note=note or 'Downloading MPD manifest',
1652             errnote=errnote or 'Failed to download MPD manifest',
1653             fatal=fatal)
1654         if res is False:
1655             return []
1656         mpd, urlh = res
1657         mpd_base_url = base_url(urlh.geturl())
1658
1659         return self._parse_mpd_formats(
1660             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1661             formats_dict=formats_dict, mpd_url=mpd_url)
1662
1663     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1664         """
1665         Parse formats from MPD manifest.
1666         References:
1667          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1668             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1669          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1670         """
1671         if mpd_doc.get('type') == 'dynamic':
1672             return []
1673
1674         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1675
1676         def _add_ns(path):
1677             return self._xpath_ns(path, namespace)
1678
1679         def is_drm_protected(element):
1680             return element.find(_add_ns('ContentProtection')) is not None
1681
1682         def extract_multisegment_info(element, ms_parent_info):
1683             ms_info = ms_parent_info.copy()
1684
1685             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1686             # common attributes and elements.  We will only extract relevant
1687             # for us.
1688             def extract_common(source):
1689                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1690                 if segment_timeline is not None:
1691                     s_e = segment_timeline.findall(_add_ns('S'))
1692                     if s_e:
1693                         ms_info['total_number'] = 0
1694                         ms_info['s'] = []
1695                         for s in s_e:
1696                             r = int(s.get('r', 0))
1697                             ms_info['total_number'] += 1 + r
1698                             ms_info['s'].append({
1699                                 't': int(s.get('t', 0)),
1700                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1701                                 'd': int(s.attrib['d']),
1702                                 'r': r,
1703                             })
1704                 start_number = source.get('startNumber')
1705                 if start_number:
1706                     ms_info['start_number'] = int(start_number)
1707                 timescale = source.get('timescale')
1708                 if timescale:
1709                     ms_info['timescale'] = int(timescale)
1710                 segment_duration = source.get('duration')
1711                 if segment_duration:
1712                     ms_info['segment_duration'] = int(segment_duration)
1713
1714             def extract_Initialization(source):
1715                 initialization = source.find(_add_ns('Initialization'))
1716                 if initialization is not None:
1717                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1718
1719             segment_list = element.find(_add_ns('SegmentList'))
1720             if segment_list is not None:
1721                 extract_common(segment_list)
1722                 extract_Initialization(segment_list)
1723                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1724                 if segment_urls_e:
1725                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1726             else:
1727                 segment_template = element.find(_add_ns('SegmentTemplate'))
1728                 if segment_template is not None:
1729                     extract_common(segment_template)
1730                     media = segment_template.get('media')
1731                     if media:
1732                         ms_info['media'] = media
1733                     initialization = segment_template.get('initialization')
1734                     if initialization:
1735                         ms_info['initialization'] = initialization
1736                     else:
1737                         extract_Initialization(segment_template)
1738             return ms_info
1739
1740         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1741         formats = []
1742         for period in mpd_doc.findall(_add_ns('Period')):
1743             period_duration = parse_duration(period.get('duration')) or mpd_duration
1744             period_ms_info = extract_multisegment_info(period, {
1745                 'start_number': 1,
1746                 'timescale': 1,
1747             })
1748             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1749                 if is_drm_protected(adaptation_set):
1750                     continue
1751                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1752                 for representation in adaptation_set.findall(_add_ns('Representation')):
1753                     if is_drm_protected(representation):
1754                         continue
1755                     representation_attrib = adaptation_set.attrib.copy()
1756                     representation_attrib.update(representation.attrib)
1757                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1758                     mime_type = representation_attrib['mimeType']
1759                     content_type = mime_type.split('/')[0]
1760                     if content_type == 'text':
1761                         # TODO implement WebVTT downloading
1762                         pass
1763                     elif content_type == 'video' or content_type == 'audio':
1764                         base_url = ''
1765                         for element in (representation, adaptation_set, period, mpd_doc):
1766                             base_url_e = element.find(_add_ns('BaseURL'))
1767                             if base_url_e is not None:
1768                                 base_url = base_url_e.text + base_url
1769                                 if re.match(r'^https?://', base_url):
1770                                     break
1771                         if mpd_base_url and not re.match(r'^https?://', base_url):
1772                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1773                                 mpd_base_url += '/'
1774                             base_url = mpd_base_url + base_url
1775                         representation_id = representation_attrib.get('id')
1776                         lang = representation_attrib.get('lang')
1777                         url_el = representation.find(_add_ns('BaseURL'))
1778                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1779                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1780                         f = {
1781                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1782                             'url': base_url,
1783                             'manifest_url': mpd_url,
1784                             'ext': mimetype2ext(mime_type),
1785                             'width': int_or_none(representation_attrib.get('width')),
1786                             'height': int_or_none(representation_attrib.get('height')),
1787                             'tbr': int_or_none(bandwidth, 1000),
1788                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1789                             'fps': int_or_none(representation_attrib.get('frameRate')),
1790                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1791                             'format_note': 'DASH %s' % content_type,
1792                             'filesize': filesize,
1793                         }
1794                         f.update(parse_codecs(representation_attrib.get('codecs')))
1795                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1796
1797                         def prepare_template(template_name, identifiers):
1798                             t = representation_ms_info[template_name]
1799                             t = t.replace('$RepresentationID$', representation_id)
1800                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1801                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1802                             t.replace('$$', '$')
1803                             return t
1804
1805                         # @initialization is a regular template like @media one
1806                         # so it should be handled just the same way (see
1807                         # https://github.com/rg3/youtube-dl/issues/11605)
1808                         if 'initialization' in representation_ms_info:
1809                             initialization_template = prepare_template(
1810                                 'initialization',
1811                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1812                                 # $Time$ shall not be included for @initialization thus
1813                                 # only $Bandwidth$ remains
1814                                 ('Bandwidth', ))
1815                             representation_ms_info['initialization_url'] = initialization_template % {
1816                                 'Bandwidth': bandwidth,
1817                             }
1818
1819                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1820
1821                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1822
1823                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1824                             # can't be used at the same time
1825                             if '%(Number' in media_template and 's' not in representation_ms_info:
1826                                 segment_duration = None
1827                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1828                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1829                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1830                                 representation_ms_info['fragments'] = [{
1831                                     'url': media_template % {
1832                                         'Number': segment_number,
1833                                         'Bandwidth': bandwidth,
1834                                     },
1835                                     'duration': segment_duration,
1836                                 } for segment_number in range(
1837                                     representation_ms_info['start_number'],
1838                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1839                             else:
1840                                 # $Number*$ or $Time$ in media template with S list available
1841                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1842                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1843                                 representation_ms_info['fragments'] = []
1844                                 segment_time = 0
1845                                 segment_d = None
1846                                 segment_number = representation_ms_info['start_number']
1847
1848                                 def add_segment_url():
1849                                     segment_url = media_template % {
1850                                         'Time': segment_time,
1851                                         'Bandwidth': bandwidth,
1852                                         'Number': segment_number,
1853                                     }
1854                                     representation_ms_info['fragments'].append({
1855                                         'url': segment_url,
1856                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1857                                     })
1858
1859                                 for num, s in enumerate(representation_ms_info['s']):
1860                                     segment_time = s.get('t') or segment_time
1861                                     segment_d = s['d']
1862                                     add_segment_url()
1863                                     segment_number += 1
1864                                     for r in range(s.get('r', 0)):
1865                                         segment_time += segment_d
1866                                         add_segment_url()
1867                                         segment_number += 1
1868                                     segment_time += segment_d
1869                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1870                             # No media template
1871                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1872                             # or any YouTube dashsegments video
1873                             fragments = []
1874                             segment_index = 0
1875                             timescale = representation_ms_info['timescale']
1876                             for s in representation_ms_info['s']:
1877                                 duration = float_or_none(s['d'], timescale)
1878                                 for r in range(s.get('r', 0) + 1):
1879                                     fragments.append({
1880                                         'url': representation_ms_info['segment_urls'][segment_index],
1881                                         'duration': duration,
1882                                     })
1883                                     segment_index += 1
1884                             representation_ms_info['fragments'] = fragments
1885                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1886                         # No fragments key is present in this case.
1887                         if 'fragments' in representation_ms_info:
1888                             f.update({
1889                                 'fragments': [],
1890                                 'protocol': 'http_dash_segments',
1891                             })
1892                             if 'initialization_url' in representation_ms_info:
1893                                 initialization_url = representation_ms_info['initialization_url']
1894                                 if not f.get('url'):
1895                                     f['url'] = initialization_url
1896                                 f['fragments'].append({'url': initialization_url})
1897                             f['fragments'].extend(representation_ms_info['fragments'])
1898                             for fragment in f['fragments']:
1899                                 fragment['url'] = urljoin(base_url, fragment['url'])
1900                         try:
1901                             existing_format = next(
1902                                 fo for fo in formats
1903                                 if fo['format_id'] == representation_id)
1904                         except StopIteration:
1905                             full_info = formats_dict.get(representation_id, {}).copy()
1906                             full_info.update(f)
1907                             formats.append(full_info)
1908                         else:
1909                             existing_format.update(f)
1910                     else:
1911                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1912         return formats
1913
1914     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1915         res = self._download_webpage_handle(
1916             ism_url, video_id,
1917             note=note or 'Downloading ISM manifest',
1918             errnote=errnote or 'Failed to download ISM manifest',
1919             fatal=fatal)
1920         if res is False:
1921             return []
1922         ism, urlh = res
1923
1924         return self._parse_ism_formats(
1925             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1926
1927     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1928         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1929             return []
1930
1931         duration = int(ism_doc.attrib['Duration'])
1932         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1933
1934         formats = []
1935         for stream in ism_doc.findall('StreamIndex'):
1936             stream_type = stream.get('Type')
1937             if stream_type not in ('video', 'audio'):
1938                 continue
1939             url_pattern = stream.attrib['Url']
1940             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1941             stream_name = stream.get('Name')
1942             for track in stream.findall('QualityLevel'):
1943                 fourcc = track.get('FourCC')
1944                 # TODO: add support for WVC1 and WMAP
1945                 if fourcc not in ('H264', 'AVC1', 'AACL'):
1946                     self.report_warning('%s is not a supported codec' % fourcc)
1947                     continue
1948                 tbr = int(track.attrib['Bitrate']) // 1000
1949                 width = int_or_none(track.get('MaxWidth'))
1950                 height = int_or_none(track.get('MaxHeight'))
1951                 sampling_rate = int_or_none(track.get('SamplingRate'))
1952
1953                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1954                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1955
1956                 fragments = []
1957                 fragment_ctx = {
1958                     'time': 0,
1959                 }
1960                 stream_fragments = stream.findall('c')
1961                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1962                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1963                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1964                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1965                     if not fragment_ctx['duration']:
1966                         try:
1967                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1968                         except IndexError:
1969                             next_fragment_time = duration
1970                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1971                     for _ in range(fragment_repeat):
1972                         fragments.append({
1973                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1974                             'duration': fragment_ctx['duration'] / stream_timescale,
1975                         })
1976                         fragment_ctx['time'] += fragment_ctx['duration']
1977
1978                 format_id = []
1979                 if ism_id:
1980                     format_id.append(ism_id)
1981                 if stream_name:
1982                     format_id.append(stream_name)
1983                 format_id.append(compat_str(tbr))
1984
1985                 formats.append({
1986                     'format_id': '-'.join(format_id),
1987                     'url': ism_url,
1988                     'manifest_url': ism_url,
1989                     'ext': 'ismv' if stream_type == 'video' else 'isma',
1990                     'width': width,
1991                     'height': height,
1992                     'tbr': tbr,
1993                     'asr': sampling_rate,
1994                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
1995                     'acodec': 'none' if stream_type == 'video' else fourcc,
1996                     'protocol': 'ism',
1997                     'fragments': fragments,
1998                     '_download_params': {
1999                         'duration': duration,
2000                         'timescale': stream_timescale,
2001                         'width': width or 0,
2002                         'height': height or 0,
2003                         'fourcc': fourcc,
2004                         'codec_private_data': track.get('CodecPrivateData'),
2005                         'sampling_rate': sampling_rate,
2006                         'channels': int_or_none(track.get('Channels', 2)),
2007                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2008                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2009                     },
2010                 })
2011         return formats
2012
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
        """Extract entries (formats, subtitles, thumbnail) from HTML5
        <video>/<audio> tags found in webpage.

        base_url -- page URL used to resolve relative src attributes
        m3u8_id/mpd_id -- format_id prefixes forwarded to the HLS/DASH
            extractors when a source points at an m3u8/mpd manifest
        Returns a list of partial info dicts, one per media tag that yielded
        at least one format or subtitle track.
        """
        def absolute_url(video_url):
            # Resolve video_url against the page URL.
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Derive partial format fields (ext, codecs) from a MIME type
            # attribute, which may carry a codecs= parameter.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type):
            # Turn one src value into (is_plain_url, formats): m3u8/mpd
            # sources expand into several formats, anything else is a single
            # plain URL.
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # Self-closing media tags have no inner content (no <source>/<track>).
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        # Plain URL: enrich the single format with fields
                        # derived from the <source> type attribute.
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        # Fall through srclang -> lang -> label for the key.
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Tags that produced nothing usable are dropped entirely.
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2096
2097     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2098         formats = []
2099         hdcore_sign = 'hdcore=3.7.0'
2100         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2101         hds_host = hosts.get('hds')
2102         if hds_host:
2103             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2104         if 'hdcore=' not in f4m_url:
2105             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2106         f4m_formats = self._extract_f4m_formats(
2107             f4m_url, video_id, f4m_id='hds', fatal=False)
2108         for entry in f4m_formats:
2109             entry.update({'extra_param_to_segment_url': hdcore_sign})
2110         formats.extend(f4m_formats)
2111         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2112         hls_host = hosts.get('hls')
2113         if hls_host:
2114             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2115         formats.extend(self._extract_m3u8_formats(
2116             m3u8_url, video_id, 'mp4', 'm3u8_native',
2117             m3u8_id='hls', fatal=False))
2118         return formats
2119
2120     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2121         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2122         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2123         http_base_url = 'http' + url_base
2124         formats = []
2125         if 'm3u8' not in skip_protocols:
2126             formats.extend(self._extract_m3u8_formats(
2127                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2128                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2129         if 'f4m' not in skip_protocols:
2130             formats.extend(self._extract_f4m_formats(
2131                 http_base_url + '/manifest.f4m',
2132                 video_id, f4m_id='hds', fatal=False))
2133         if 'dash' not in skip_protocols:
2134             formats.extend(self._extract_mpd_formats(
2135                 http_base_url + '/manifest.mpd',
2136                 video_id, mpd_id='dash', fatal=False))
2137         if re.search(r'(?:/smil:|\.smil)', url_base):
2138             if 'smil' not in skip_protocols:
2139                 rtmp_formats = self._extract_smil_formats(
2140                     http_base_url + '/jwplayer.smil',
2141                     video_id, fatal=False)
2142                 for rtmp_format in rtmp_formats:
2143                     rtsp_format = rtmp_format.copy()
2144                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2145                     del rtsp_format['play_path']
2146                     del rtsp_format['ext']
2147                     rtsp_format.update({
2148                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2149                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2150                         'protocol': 'rtsp',
2151                     })
2152                     formats.extend([rtmp_format, rtsp_format])
2153         else:
2154             for protocol in ('rtmp', 'rtsp'):
2155                 if protocol not in skip_protocols:
2156                     formats.append({
2157                         'url': protocol + url_base,
2158                         'format_id': protocol,
2159                         'protocol': protocol,
2160                     })
2161         return formats
2162
2163     @staticmethod
2164     def _find_jwplayer_data(webpage):
2165         mobj = re.search(
2166             r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2167             webpage)
2168         if mobj:
2169             return mobj.group('options')
2170
2171     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2172         jwplayer_data = self._parse_json(
2173             self._find_jwplayer_data(webpage), video_id,
2174             transform_source=js_to_json)
2175         return self._parse_jwplayer_data(
2176             jwplayer_data, video_id, *args, **kwargs)
2177
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a parsed JWPlayer setup/config dict into an info dict
        (single playlist item) or a playlist result (several items).

        jwplayer_data -- JWPlayer options; legacy pre-7.x flattened layouts
            are normalized in place below
        require_title -- when True, a playlist item without 'title' raises
        rtmp_params -- extra format fields merged into RTMP formats
        base_url -- base for resolving relative source and track URLs
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = []
            for source in video_data['sources']:
                source_url = self._proto_relative_url(source['file'])
                if base_url:
                    source_url = compat_urlparse.urljoin(base_url, source_url)
                source_type = source.get('type') or ''
                # Prefer the declared MIME type, fall back to the URL suffix.
                ext = mimetype2ext(source_type) or determine_ext(source_url)
                if source_type == 'hls' or ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
                elif ext == 'mpd':
                    formats.extend(self._extract_mpd_formats(
                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                    formats.append({
                        'url': source_url,
                        'vcodec': 'none',
                        'ext': ext,
                    })
                else:
                    height = int_or_none(source.get('height'))
                    if height is None:
                        # Often no height is provided but there is a label in
                        # format like 1080p.
                        height = int_or_none(self._search_regex(
                            r'^(\d{3,})[pP]$', source.get('label') or '',
                            'height', default=None))
                    a_format = {
                        'url': source_url,
                        'width': int_or_none(source.get('width')),
                        'height': height,
                        'ext': ext,
                    }
                    if source_url.startswith('rtmp'):
                        a_format['ext'] = 'flv'

                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                        # of jwplayer.flash.swf
                        rtmp_url_parts = re.split(
                            r'((?:mp4|mp3|flv):)', source_url, 1)
                        if len(rtmp_url_parts) == 3:
                            rtmp_url, prefix, play_path = rtmp_url_parts
                            a_format.update({
                                'url': rtmp_url,
                                'play_path': prefix + play_path,
                            })
                        if rtmp_params:
                            a_format.update(rtmp_params)
                    formats.append(a_format)
            self._sort_formats(formats)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    # Only caption tracks become subtitles.
                    if track.get('kind') != 'captions':
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entries.append({
                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'formats': formats,
            })
        # A single item is returned directly; several become a playlist.
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2279
2280     def _live_title(self, name):
2281         """ Generate the title for a live video """
2282         now = datetime.datetime.now()
2283         now_str = now.strftime('%Y-%m-%d %H:%M')
2284         return name + ' ' + now_str
2285
2286     def _int(self, v, name, fatal=False, **kwargs):
2287         res = int_or_none(v, **kwargs)
2288         if 'get_attr' in kwargs:
2289             print(getattr(v, kwargs['get_attr']))
2290         if res is None:
2291             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2292             if fatal:
2293                 raise ExtractorError(msg)
2294             else:
2295                 self._downloader.report_warning(msg)
2296         return res
2297
2298     def _float(self, v, name, fatal=False, **kwargs):
2299         res = float_or_none(v, **kwargs)
2300         if res is None:
2301             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2302             if fatal:
2303                 raise ExtractorError(msg)
2304             else:
2305                 self._downloader.report_warning(msg)
2306         return res
2307
    def _set_cookie(self, domain, name, value, expire_time=None):
        """Store a name=value cookie for *domain* (path '/') in the
        downloader's cookie jar, optionally with an expiry timestamp."""
        # compat_cookiejar.Cookie takes its fields positionally:
        # version, name, value, port, port_specified, domain,
        # domain_specified, domain_initial_dot, path, path_specified,
        # secure, expires, discard, comment, comment_url, rest.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
2313
2314     def _get_cookies(self, url):
2315         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2316         req = sanitized_Request(url)
2317         self._downloader.cookiejar.add_cookie_header(req)
2318         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2319
2320     def get_testcases(self, include_onlymatching=False):
2321         t = getattr(self, '_TEST', None)
2322         if t:
2323             assert not hasattr(self, '_TESTS'), \
2324                 '%s has _TEST and _TESTS' % type(self).__name__
2325             tests = [t]
2326         else:
2327             tests = getattr(self, '_TESTS', [])
2328         for t in tests:
2329             if not include_onlymatching and t.get('only_matching', False):
2330                 continue
2331             t['name'] = type(self).__name__[:-len('IE')]
2332             yield t
2333
2334     def is_suitable(self, age_limit):
2335         """ Test whether the extractor is generally suitable for the given
2336         age limit (i.e. pornographic sites are not, all others usually are) """
2337
2338         any_restricted = False
2339         for tc in self.get_testcases(include_onlymatching=False):
2340             if tc.get('playlist', []):
2341                 tc = tc['playlist'][0]
2342             is_restricted = age_restricted(
2343                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2344             if not is_restricted:
2345                 return True
2346             any_restricted = any_restricted or is_restricted
2347         return not any_restricted
2348
2349     def extract_subtitles(self, *args, **kwargs):
2350         if (self._downloader.params.get('writesubtitles', False) or
2351                 self._downloader.params.get('listsubtitles')):
2352             return self._get_subtitles(*args, **kwargs)
2353         return {}
2354
    def _get_subtitles(self, *args, **kwargs):
        # Override point for subclasses; called by extract_subtitles() when
        # subtitle extraction was requested.
        raise NotImplementedError('This method must be implemented by subclasses')
2357
2358     @staticmethod
2359     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2360         """ Merge subtitle items for one language. Items with duplicated URLs
2361         will be dropped. """
2362         list1_urls = set([item['url'] for item in subtitle_list1])
2363         ret = list(subtitle_list1)
2364         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2365         return ret
2366
2367     @classmethod
2368     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2369         """ Merge two subtitle dictionaries, language by language. """
2370         ret = dict(subtitle_dict1)
2371         for lang in subtitle_dict2:
2372             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2373         return ret
2374
2375     def extract_automatic_captions(self, *args, **kwargs):
2376         if (self._downloader.params.get('writeautomaticsub', False) or
2377                 self._downloader.params.get('listsubtitles')):
2378             return self._get_automatic_captions(*args, **kwargs)
2379         return {}
2380
    def _get_automatic_captions(self, *args, **kwargs):
        # Override point for subclasses; called by extract_automatic_captions()
        # when automatic caption extraction was requested.
        raise NotImplementedError('This method must be implemented by subclasses')
2383
2384     def mark_watched(self, *args, **kwargs):
2385         if (self._downloader.params.get('mark_watched', False) and
2386                 (self._get_login_info()[0] is not None or
2387                     self._downloader.params.get('cookiefile') is not None)):
2388             self._mark_watched(*args, **kwargs)
2389
    def _mark_watched(self, *args, **kwargs):
        # Override point for subclasses; called by mark_watched() once the
        # user-enabled and authentication checks passed.
        raise NotImplementedError('This method must be implemented by subclasses')
2392
2393     def geo_verification_headers(self):
2394         headers = {}
2395         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2396         if geo_verification_proxy:
2397             headers['Ytdl-request-proxy'] = geo_verification_proxy
2398         return headers
2399
2400     def _generic_id(self, url):
2401         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2402
2403     def _generic_title(self, url):
2404         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2405
2406
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (one result), a positive number, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Dispatch a '<key><prefix>:<query>' URL to _get_n_results with the
        requested (capped) number of results."""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        query = match.group('query')
        if prefix == '':
            # Bare search key: fetch a single result.
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Cap the request at the extractor's maximum, with a warning.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY