[generic] Try parsing JWPlayer embedded videos (closes #12030)
[youtube-dl] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import math
14
15 from ..compat import (
16     compat_cookiejar,
17     compat_cookies,
18     compat_etree_fromstring,
19     compat_getpass,
20     compat_http_client,
21     compat_os_name,
22     compat_str,
23     compat_urllib_error,
24     compat_urllib_parse_unquote,
25     compat_urllib_parse_urlencode,
26     compat_urllib_request,
27     compat_urlparse,
28 )
29 from ..downloader.f4m import remove_encrypted_media
30 from ..utils import (
31     NO_DEFAULT,
32     age_restricted,
33     base_url,
34     bug_reports_message,
35     clean_html,
36     compiled_regex_type,
37     determine_ext,
38     error_to_compat_str,
39     ExtractorError,
40     fix_xml_ampersands,
41     float_or_none,
42     int_or_none,
43     js_to_json,
44     parse_iso8601,
45     RegexNotFoundError,
46     sanitize_filename,
47     sanitized_Request,
48     unescapeHTML,
49     unified_strdate,
50     unified_timestamp,
51     url_basename,
52     xpath_element,
53     xpath_text,
54     xpath_with_ns,
55     determine_protocol,
56     parse_duration,
57     mimetype2ext,
58     update_Request,
59     update_url_query,
60     parse_m3u8_attributes,
61     extract_attributes,
62     parse_codecs,
63     urljoin,
64 )
65
66
67 class InfoExtractor(object):
68     """Information Extractor class.
69
70     Information extractors are the classes that, given a URL, extract
71     information about the video (or videos) the URL refers to. This
72     information includes the real video URL, the video title, author and
73     others. The information is stored in a dictionary which is then
74     passed to the YoutubeDL. The YoutubeDL processes this
75     information possibly downloading the video to the file system, among
76     other possible outcomes.
77
78     The type field determines the type of the result.
79     By far the most common value (and the default if _type is missing) is
80     "video", which indicates a single video.
81
82     For a video, the dictionaries must include the following fields:
83
84     id:             Video identifier.
85     title:          Video title, unescaped.
86
87     Additionally, it must contain either a formats entry or a url one:
88
89     formats:        A list of dictionaries for each format available, ordered
90                     from worst to best quality.
91
92                     Potential fields:
93                     * url        Mandatory. The URL of the video file
94                     * manifest_url
95                                  The URL of the manifest file in case of
96                                  fragmented media (DASH, hls, hds)
97                     * ext        Will be calculated from URL if missing
98                     * format     A human-readable description of the format
99                                  ("mp4 container with h264/opus").
100                                  Calculated from the format_id, width, height.
101                                  and format_note fields if missing.
102                     * format_id  A short description of the format
103                                  ("mp4_h264_opus" or "19").
104                                 Technically optional, but strongly recommended.
105                     * format_note Additional info about the format
106                                  ("3D" or "DASH video")
107                     * width      Width of the video, if known
108                     * height     Height of the video, if known
109                     * resolution Textual description of width and height
110                     * tbr        Average bitrate of audio and video in KBit/s
111                     * abr        Average audio bitrate in KBit/s
112                     * acodec     Name of the audio codec in use
113                     * asr        Audio sampling rate in Hertz
114                     * vbr        Average video bitrate in KBit/s
115                     * fps        Frame rate
116                     * vcodec     Name of the video codec in use
117                     * container  Name of the container format
118                     * filesize   The number of bytes, if known in advance
119                     * filesize_approx  An estimate for the number of bytes
120                     * player_url SWF Player URL (used for rtmpdump).
121                     * protocol   The protocol that will be used for the actual
122                                  download, lower-case.
123                                  "http", "https", "rtsp", "rtmp", "rtmpe",
124                                  "m3u8", "m3u8_native" or "http_dash_segments".
125                     * fragment_base_url
126                                  Base URL for fragments. Each fragment's path
127                                  value (if present) will be relative to
128                                  this URL.
129                     * fragments  A list of fragments of a fragmented media.
130                                  Each fragment entry must contain either an url
131                                  or a path. If an url is present it should be
132                                  considered by a client. Otherwise both path and
133                                  fragment_base_url must be present. Here is
134                                  the list of all potential fields:
135                                  * "url" - fragment's URL
136                                  * "path" - fragment's path relative to
137                                             fragment_base_url
138                                  * "duration" (optional, int or float)
139                                  * "filesize" (optional, int)
140                     * preference Order number of this format. If this field is
141                                  present and not None, the formats get sorted
142                                  by this field, regardless of all other values.
143                                  -1 for default (order by other properties),
144                                  -2 or smaller for less than default.
145                                  < -1000 to hide the format (if there is
146                                     another one which is strictly better)
147                     * language   Language code, e.g. "de" or "en-US".
148                     * language_preference  Is this in the language mentioned in
149                                  the URL?
150                                  10 if it's what the URL is about,
151                                  -1 for default (don't know),
152                                  -10 otherwise, other values reserved for now.
153                     * quality    Order number of the video quality of this
154                                  format, irrespective of the file format.
155                                  -1 for default (order by other properties),
156                                  -2 or smaller for less than default.
157                     * source_preference  Order number for this video source
158                                   (quality takes higher priority)
159                                  -1 for default (order by other properties),
160                                  -2 or smaller for less than default.
161                     * http_headers  A dictionary of additional HTTP headers
162                                  to add to the request.
163                     * stretched_ratio  If given and not 1, indicates that the
164                                  video's pixels are not square.
165                                  width : height ratio as float.
166                     * no_resume  The server does not support resuming the
167                                  (HTTP or RTMP) download. Boolean.
168
169     url:            Final video URL.
170     ext:            Video filename extension.
171     format:         The video format, defaults to ext (used for --get-format)
172     player_url:     SWF Player URL (used for rtmpdump).
173
174     The following fields are optional:
175
176     alt_title:      A secondary title of the video.
177     display_id      An alternative identifier for the video, not necessarily
178                     unique, but available before title. Typically, id is
179                     something like "4234987", title "Dancing naked mole rats",
180                     and display_id "dancing-naked-mole-rats"
181     thumbnails:     A list of dictionaries, with the following entries:
182                         * "id" (optional, string) - Thumbnail format ID
183                         * "url"
184                         * "preference" (optional, int) - quality of the image
185                         * "width" (optional, int)
186                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
188                                         deprecated)
189                         * "filesize" (optional, int)
190     thumbnail:      Full URL to a video thumbnail image.
191     description:    Full video description.
192     uploader:       Full name of the video uploader.
193     license:        License name the video is licensed under.
194     creator:        The creator of the video.
195     release_date:   The date (YYYYMMDD) when the video was released.
196     timestamp:      UNIX timestamp of the moment the video became available.
197     upload_date:    Video upload date (YYYYMMDD).
198                     If not explicitly set, calculated from timestamp.
199     uploader_id:    Nickname or id of the video uploader.
200     uploader_url:   Full URL to a personal webpage of the video uploader.
201     location:       Physical location where the video was filmed.
202     subtitles:      The available subtitles as a dictionary in the format
203                     {tag: subformats}. "tag" is usually a language code, and
204                     "subformats" is a list sorted from lower to higher
205                     preference, each element is a dictionary with the "ext"
206                     entry and one of:
207                         * "data": The subtitles file contents
208                         * "url": A URL pointing to the subtitles file
209                     "ext" will be calculated from URL if missing
210     automatic_captions: Like 'subtitles', used by the YoutubeIE for
211                     automatically generated captions
212     duration:       Length of the video in seconds, as an integer or float.
213     view_count:     How many users have watched the video on the platform.
214     like_count:     Number of positive ratings of the video
215     dislike_count:  Number of negative ratings of the video
216     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
218     comment_count:  Number of comments on the video
219     comments:       A list of comments, each with one or more of the following
220                     properties (all but one of text or html optional):
221                         * "author" - human-readable name of the comment author
222                         * "author_id" - user ID of the comment author
223                         * "id" - Comment ID
224                         * "html" - Comment as HTML
225                         * "text" - Plain text of the comment
226                         * "timestamp" - UNIX timestamp of comment
227                         * "parent" - ID of the comment this one is replying to.
228                                      Set to "root" to indicate that this is a
229                                      comment to the original video.
230     age_limit:      Age restriction for the video, as an integer (years)
231     webpage_url:    The URL to the video webpage, if given to youtube-dl it
232                     should allow to get the same result again. (It will be set
233                     by YoutubeDL if it's missing)
234     categories:     A list of categories that the video falls in, for example
235                     ["Sports", "Berlin"]
236     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
237     is_live:        True, False, or None (=unknown). Whether this video is a
238                     live stream that goes on instead of a fixed-length video.
239     start_time:     Time in seconds where the reproduction should start, as
240                     specified in the URL.
241     end_time:       Time in seconds where the reproduction should end, as
242                     specified in the URL.
243
244     The following fields should only be used when the video belongs to some logical
245     chapter or section:
246
247     chapter:        Name or title of the chapter the video belongs to.
248     chapter_number: Number of the chapter the video belongs to, as an integer.
249     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
250
251     The following fields should only be used when the video is an episode of some
252     series, programme or podcast:
253
254     series:         Title of the series or programme the video episode belongs to.
255     season:         Title of the season the video episode belongs to.
256     season_number:  Number of the season the video episode belongs to, as an integer.
257     season_id:      Id of the season the video episode belongs to, as a unicode string.
258     episode:        Title of the video episode. Unlike mandatory video title field,
259                     this field should denote the exact title of the video episode
260                     without any kind of decoration.
261     episode_number: Number of the video episode within a season, as an integer.
262     episode_id:     Id of the video episode, as a unicode string.
263
264     The following fields should only be used when the media is a track or a part of
265     a music album:
266
267     track:          Title of the track.
268     track_number:   Number of the track within an album or a disc, as an integer.
269     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
270                     as a unicode string.
271     artist:         Artist(s) of the track.
272     genre:          Genre(s) of the track.
273     album:          Title of the album the track belongs to.
274     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
275     album_artist:   List of all artists appeared on the album (e.g.
276                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
277                     and compilations).
278     disc_number:    Number of the disc or other physical medium the track belongs to,
279                     as an integer.
280     release_year:   Year (YYYY) when the album was released.
281
282     Unless mentioned otherwise, the fields should be Unicode strings.
283
284     Unless mentioned otherwise, None is equivalent to absence of information.
285
286
287     _type "playlist" indicates multiple videos.
288     There must be a key "entries", which is a list, an iterable, or a PagedList
289     object, each element of which is a valid dictionary by this specification.
290
291     Additionally, playlists can have "title", "description" and "id" attributes
292     with the same semantics as videos (see above).
293
294
295     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
297     It must have an entries key like a playlist and contain all the keys
298     required for a video at the same time.
299
300
301     _type "url" indicates that the video must be extracted from another
302     location, possibly by a different extractor. Its only required key is:
303     "url" - the next URL to extract.
304     The key "ie_key" can be set to the class name (minus the trailing "IE",
305     e.g. "Youtube") if the extractor class is known in advance.
306     Additionally, the dictionary may have any properties of the resolved entity
307     known in advance, for example "title" if the title of the referred video is
308     known ahead of time.
309
310
311     _type "url_transparent" entities have the same specification as "url", but
312     indicate that the given additional information is more precise than the one
313     associated with the resolved URL.
314     This is useful when a site employs a video service that hosts the video and
315     its technical metadata, but that video service does not embed a useful
316     title, description etc.
317
318
319     Subclasses of this one should re-define the _real_initialize() and
320     _real_extract() methods and define a _VALID_URL regexp.
321     Probably, they should also be added to the list of extractors.
322
323     Finally, the _WORKING attribute should be set to False for broken IEs
324     in order to warn the users and skip the tests.
325     """
326
    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # The YoutubeDL instance used for output and network access;
    # assigned via set_downloader().
    _downloader = None
    # Set to False in subclasses to mark a broken IE: warns users and
    # skips the tests (see class docstring).
    _WORKING = True
330
331     def __init__(self, downloader=None):
332         """Constructor. Receives an optional downloader."""
333         self._ready = False
334         self.set_downloader(downloader)
335
336     @classmethod
337     def suitable(cls, url):
338         """Receives a URL and returns True if suitable for this IE."""
339
340         # This does not use has/getattr intentionally - we want to know whether
341         # we have cached the regexp for *this* class, whereas getattr would also
342         # match the superclass
343         if '_VALID_URL_RE' not in cls.__dict__:
344             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
345         return cls._VALID_URL_RE.match(url) is not None
346
347     @classmethod
348     def _match_id(cls, url):
349         if '_VALID_URL_RE' not in cls.__dict__:
350             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
351         m = cls._VALID_URL_RE.match(url)
352         assert m
353         return m.group('id')
354
355     @classmethod
356     def working(cls):
357         """Getter method for _WORKING."""
358         return cls._WORKING
359
360     def initialize(self):
361         """Initializes an instance (authentication, etc)."""
362         if not self._ready:
363             self._real_initialize()
364             self._ready = True
365
366     def extract(self, url):
367         """Extracts URL information and returns it in list of dicts."""
368         try:
369             self.initialize()
370             return self._real_extract(url)
371         except ExtractorError:
372             raise
373         except compat_http_client.IncompleteRead as e:
374             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
375         except (KeyError, StopIteration) as e:
376             raise ExtractorError('An extractor error has occurred.', cause=e)
377
378     def set_downloader(self, downloader):
379         """Sets the downloader for this IE."""
380         self._downloader = downloader
381
382     def _real_initialize(self):
383         """Real initialization process. Redefine in subclasses."""
384         pass
385
386     def _real_extract(self, url):
387         """Real extraction process. Redefine in subclasses."""
388         pass
389
390     @classmethod
391     def ie_key(cls):
392         """A string for getting the InfoExtractor with get_info_extractor"""
393         return compat_str(cls.__name__[:-2])
394
395     @property
396     def IE_NAME(self):
397         return compat_str(type(self).__name__[:-2])
398
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" message,
        # False -> stay silent, any other value -> printed as given.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        if isinstance(url_or_request, compat_urllib_request.Request):
            # Merge data/headers/query into the existing Request object.
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            # Plain URL string: apply query params, then wrap in a Request
            # only if data or headers make one necessary.
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                # Caller explicitly asked for silence on errors.
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                # Non-fatal: warn and signal failure with False.
                self._downloader.report_warning(errmsg)
                return False
430
431     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
432         """ Returns a tuple (page content as string, URL handle) """
433         # Strip hashes from the URL (#1038)
434         if isinstance(url_or_request, (compat_str, str)):
435             url_or_request = url_or_request.partition('#')[0]
436
437         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
438         if urlh is False:
439             assert not fatal
440             return False
441         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
442         return (content, urlh)
443
444     @staticmethod
445     def _guess_encoding_from_content(content_type, webpage_bytes):
446         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
447         if m:
448             encoding = m.group(1)
449         else:
450             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
451                           webpage_bytes[:1024])
452             if m:
453                 encoding = m.group(1).decode('ascii')
454             elif webpage_bytes.startswith(b'\xff\xfe'):
455                 encoding = 'utf-16'
456             else:
457                 encoding = 'utf-8'
458
459         return encoding
460
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the response body of urlh and return it decoded to text.

        Applies the optional bytes prefix, guesses the encoding unless one is
        given, honours the dump_intermediate_pages/write_pages debug options,
        and raises ExtractorError on known censorship/filtering block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is a plain URL string.
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # Base64 keeps binary-ish page content safe to print.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the filename within common filesystem limits by
                # replacing the overlong tail with an md5 of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name - fall back to UTF-8 rather than failing.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect and report well-known blocking pages instead of returning
        # their HTML as if it were the requested content.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
523
524     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
525         """ Returns the data of the page as a string """
526         success = False
527         try_count = 0
528         while success is False:
529             try:
530                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
531                 success = True
532             except compat_http_client.IncompleteRead as e:
533                 try_count += 1
534                 if try_count >= tries:
535                     raise e
536                 self._sleep(timeout, video_id)
537         if res is False:
538             return res
539         else:
540             content, _ = res
541             return content
542
543     def _download_xml(self, url_or_request, video_id,
544                       note='Downloading XML', errnote='Unable to download XML',
545                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
546         """Return the xml as an xml.etree.ElementTree.Element"""
547         xml_string = self._download_webpage(
548             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
549         if xml_string is False:
550             return xml_string
551         if transform_source:
552             xml_string = transform_source(xml_string)
553         return compat_etree_fromstring(xml_string.encode('utf-8'))
554
555     def _download_json(self, url_or_request, video_id,
556                        note='Downloading JSON metadata',
557                        errnote='Unable to download JSON metadata',
558                        transform_source=None,
559                        fatal=True, encoding=None, data=None, headers={}, query={}):
560         json_string = self._download_webpage(
561             url_or_request, video_id, note, errnote, fatal=fatal,
562             encoding=encoding, data=data, headers=headers, query=query)
563         if (not fatal) and json_string is False:
564             return None
565         return self._parse_json(
566             json_string, video_id, transform_source=transform_source, fatal=fatal)
567
568     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
569         if transform_source:
570             json_string = transform_source(json_string)
571         try:
572             return json.loads(json_string)
573         except ValueError as ve:
574             errmsg = '%s: Failed to parse JSON ' % video_id
575             if fatal:
576                 raise ExtractorError(errmsg, cause=ve)
577             else:
578                 self.report_warning(errmsg + str(ve))
579
580     def report_warning(self, msg, video_id=None):
581         idstr = '' if video_id is None else '%s: ' % video_id
582         self._downloader.report_warning(
583             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
584
585     def to_screen(self, msg):
586         """Print msg to screen, prefixing it with '[ie_name]'"""
587         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
588
589     def report_extraction(self, id_or_name):
590         """Report information extraction."""
591         self.to_screen('%s: Extracting information' % id_or_name)
592
593     def report_download_webpage(self, video_id):
594         """Report webpage download."""
595         self.to_screen('%s: Downloading webpage' % video_id)
596
597     def report_age_confirmation(self):
598         """Report attempt to confirm age."""
599         self.to_screen('Confirming age')
600
601     def report_login(self):
602         """Report attempt to log in."""
603         self.to_screen('Logging in')
604
605     @staticmethod
606     def raise_login_required(msg='This video is only available for registered users'):
607         raise ExtractorError(
608             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
609             expected=True)
610
611     @staticmethod
612     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
613         raise ExtractorError(
614             '%s. You might want to use --proxy to workaround.' % msg,
615             expected=True)
616
617     # Methods for following #608
618     @staticmethod
619     def url_result(url, ie=None, video_id=None, video_title=None):
620         """Returns a URL that points to a page that should be processed"""
621         # TODO: ie should be the class used for getting the info
622         video_info = {'_type': 'url',
623                       'url': url,
624                       'ie_key': ie}
625         if video_id is not None:
626             video_info['id'] = video_id
627         if video_title is not None:
628             video_info['title'] = video_title
629         return video_info
630
631     @staticmethod
632     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
633         """Returns a playlist"""
634         video_info = {'_type': 'playlist',
635                       'entries': entries}
636         if playlist_id:
637             video_info['id'] = playlist_id
638         if playlist_title:
639             video_info['title'] = playlist_title
640         if playlist_description:
641             video_info['description'] = playlist_description
642         return video_info
643
644     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
645         """
646         Perform a regex search on the given string, using a single or a list of
647         patterns returning the first matching group.
648         In case of failure return a default value or raise a WARNING or a
649         RegexNotFoundError, depending on fatal, specifying the field name.
650         """
651         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
652             mobj = re.search(pattern, string, flags)
653         else:
654             for p in pattern:
655                 mobj = re.search(p, string, flags)
656                 if mobj:
657                     break
658
659         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
660             _name = '\033[0;34m%s\033[0m' % name
661         else:
662             _name = name
663
664         if mobj:
665             if group is None:
666                 # return the first matching group
667                 return next(g for g in mobj.groups() if g is not None)
668             else:
669                 return mobj.group(group)
670         elif default is not NO_DEFAULT:
671             return default
672         elif fatal:
673             raise RegexNotFoundError('Unable to extract %s' % _name)
674         else:
675             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
676             return None
677
678     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
679         """
680         Like _search_regex, but strips HTML tags and unescapes entities.
681         """
682         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
683         if res:
684             return clean_html(res).strip()
685         else:
686             return res
687
688     def _get_netrc_login_info(self, netrc_machine=None):
689         username = None
690         password = None
691         netrc_machine = netrc_machine or self._NETRC_MACHINE
692
693         if self._downloader.params.get('usenetrc', False):
694             try:
695                 info = netrc.netrc().authenticators(netrc_machine)
696                 if info is not None:
697                     username = info[0]
698                     password = info[2]
699                 else:
700                     raise netrc.NetrcParseError(
701                         'No authenticators for %s' % netrc_machine)
702             except (IOError, netrc.NetrcParseError) as err:
703                 self._downloader.report_warning(
704                     'parsing .netrc: %s' % error_to_compat_str(err))
705
706         return username, password
707
708     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
709         """
710         Get the login info as (username, password)
711         First look for the manually specified credentials using username_option
712         and password_option as keys in params dictionary. If no such credentials
713         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
714         value.
715         If there's no info available, return (None, None)
716         """
717         if self._downloader is None:
718             return (None, None)
719
720         downloader_params = self._downloader.params
721
722         # Attempt to use provided username and password or .netrc data
723         if downloader_params.get(username_option) is not None:
724             username = downloader_params[username_option]
725             password = downloader_params[password_option]
726         else:
727             username, password = self._get_netrc_login_info(netrc_machine)
728
729         return username, password
730
731     def _get_tfa_info(self, note='two-factor verification code'):
732         """
733         Get the two-factor authentication info
734         TODO - asking the user will be required for sms/phone verify
735         currently just uses the command line option
736         If there's no info available, return None
737         """
738         if self._downloader is None:
739             return None
740         downloader_params = self._downloader.params
741
742         if downloader_params.get('twofactor') is not None:
743             return downloader_params['twofactor']
744
745         return compat_getpass('Type %s and press [Return]: ' % note)
746
747     # Helper functions for extracting OpenGraph info
748     @staticmethod
749     def _og_regexes(prop):
750         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
751         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
752                        % {'prop': re.escape(prop)})
753         template = r'<meta[^>]+?%s[^>]+?%s'
754         return [
755             template % (property_re, content_re),
756             template % (content_re, property_re),
757         ]
758
759     @staticmethod
760     def _meta_regex(prop):
761         return r'''(?isx)<meta
762                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
763                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
764
765     def _og_search_property(self, prop, html, name=None, **kargs):
766         if not isinstance(prop, (list, tuple)):
767             prop = [prop]
768         if name is None:
769             name = 'OpenGraph %s' % prop[0]
770         og_regexes = []
771         for p in prop:
772             og_regexes.extend(self._og_regexes(p))
773         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
774         if escaped is None:
775             return None
776         return unescapeHTML(escaped)
777
778     def _og_search_thumbnail(self, html, **kargs):
779         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
780
781     def _og_search_description(self, html, **kargs):
782         return self._og_search_property('description', html, fatal=False, **kargs)
783
784     def _og_search_title(self, html, **kargs):
785         return self._og_search_property('title', html, **kargs)
786
787     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
788         regexes = self._og_regexes('video') + self._og_regexes('video:url')
789         if secure:
790             regexes = self._og_regexes('video:secure_url') + regexes
791         return self._html_search_regex(regexes, html, name, **kargs)
792
793     def _og_search_url(self, html, **kargs):
794         return self._og_search_property('url', html, **kargs)
795
796     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
797         if not isinstance(name, (list, tuple)):
798             name = [name]
799         if display_name is None:
800             display_name = name[0]
801         return self._html_search_regex(
802             [self._meta_regex(n) for n in name],
803             html, display_name, fatal=fatal, group='content', **kwargs)
804
805     def _dc_search_uploader(self, html):
806         return self._html_search_meta('dc.creator', html, 'uploader')
807
808     def _rta_search(self, html):
809         # See http://www.rtalabel.org/index.php?content=howtofaq#single
810         if re.search(r'(?ix)<meta\s+name="rating"\s+'
811                      r'     content="RTA-5042-1996-1400-1577-RTA"',
812                      html):
813             return 18
814         return 0
815
816     def _media_rating_search(self, html):
817         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
818         rating = self._html_search_meta('rating', html)
819
820         if not rating:
821             return None
822
823         RATING_TABLE = {
824             'safe for kids': 0,
825             'general': 8,
826             '14 years': 14,
827             'mature': 17,
828             'restricted': 19,
829         }
830         return RATING_TABLE.get(rating.lower())
831
832     def _family_friendly_search(self, html):
833         # See http://schema.org/VideoObject
834         family_friendly = self._html_search_meta('isFamilyFriendly', html)
835
836         if not family_friendly:
837             return None
838
839         RATING_TABLE = {
840             '1': 0,
841             'true': 0,
842             '0': 18,
843             'false': 18,
844         }
845         return RATING_TABLE.get(family_friendly.lower())
846
847     def _twitter_search_player(self, html):
848         return self._html_search_meta('twitter:player', html,
849                                       'twitter card player')
850
851     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
852         json_ld = self._search_regex(
853             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
854             html, 'JSON-LD', group='json_ld', **kwargs)
855         default = kwargs.get('default', NO_DEFAULT)
856         if not json_ld:
857             return default if default is not NO_DEFAULT else {}
858         # JSON-LD may be malformed and thus `fatal` should be respected.
859         # At the same time `default` may be passed that assumes `fatal=False`
860         # for _search_regex. Let's simulate the same behavior here as well.
861         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
862         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
863
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Turn JSON-LD metadata (a JSON string or an already-parsed
        structure) into an info dict.

        Only the first entry with a schema.org @context is considered, and
        only the TVEpisode, Article and VideoObject types are mapped; keys
        whose value would be None are dropped from the result.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize a lone object to a one-element list
            json_ld = [json_ld]
        for e in json_ld:
            # NOTE(review): only the exact string 'http://schema.org' is
            # accepted here; https:// or trailing-slash variants are skipped
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    # Gives up on the first non-matching entry instead of
                    # scanning the remaining ones
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    info.update({
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
                    })
                # Only the first schema.org entry is used
                break
        return dict((k, v) for k, v in info.items() if v is not None)
912
913     @staticmethod
914     def _hidden_inputs(html):
915         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
916         hidden_inputs = {}
917         for input in re.findall(r'(?i)(<input[^>]+>)', html):
918             attrs = extract_attributes(input)
919             if not input:
920                 continue
921             if attrs.get('type') not in ('hidden', 'submit'):
922                 continue
923             name = attrs.get('name') or attrs.get('id')
924             value = attrs.get('value')
925             if name and value is not None:
926                 hidden_inputs[name] = value
927         return hidden_inputs
928
929     def _form_hidden_inputs(self, form_id, html):
930         form = self._search_regex(
931             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
932             html, '%s form' % form_id, group='form')
933         return self._hidden_inputs(form)
934
935     def _sort_formats(self, formats, field_preference=None):
936         if not formats:
937             raise ExtractorError('No video formats found')
938
939         for f in formats:
940             # Automatically determine tbr when missing based on abr and vbr (improves
941             # formats sorting in some cases)
942             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
943                 f['tbr'] = f['abr'] + f['vbr']
944
945         def _formats_key(f):
946             # TODO remove the following workaround
947             from ..utils import determine_ext
948             if not f.get('ext') and 'url' in f:
949                 f['ext'] = determine_ext(f['url'])
950
951             if isinstance(field_preference, (list, tuple)):
952                 return tuple(
953                     f.get(field)
954                     if f.get(field) is not None
955                     else ('' if field == 'format_id' else -1)
956                     for field in field_preference)
957
958             preference = f.get('preference')
959             if preference is None:
960                 preference = 0
961                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
962                     preference -= 0.5
963
964             protocol = f.get('protocol') or determine_protocol(f)
965             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
966
967             if f.get('vcodec') == 'none':  # audio only
968                 preference -= 50
969                 if self._downloader.params.get('prefer_free_formats'):
970                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
971                 else:
972                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
973                 ext_preference = 0
974                 try:
975                     audio_ext_preference = ORDER.index(f['ext'])
976                 except ValueError:
977                     audio_ext_preference = -1
978             else:
979                 if f.get('acodec') == 'none':  # video only
980                     preference -= 40
981                 if self._downloader.params.get('prefer_free_formats'):
982                     ORDER = ['flv', 'mp4', 'webm']
983                 else:
984                     ORDER = ['webm', 'flv', 'mp4']
985                 try:
986                     ext_preference = ORDER.index(f['ext'])
987                 except ValueError:
988                     ext_preference = -1
989                 audio_ext_preference = 0
990
991             return (
992                 preference,
993                 f.get('language_preference') if f.get('language_preference') is not None else -1,
994                 f.get('quality') if f.get('quality') is not None else -1,
995                 f.get('tbr') if f.get('tbr') is not None else -1,
996                 f.get('filesize') if f.get('filesize') is not None else -1,
997                 f.get('vbr') if f.get('vbr') is not None else -1,
998                 f.get('height') if f.get('height') is not None else -1,
999                 f.get('width') if f.get('width') is not None else -1,
1000                 proto_preference,
1001                 ext_preference,
1002                 f.get('abr') if f.get('abr') is not None else -1,
1003                 audio_ext_preference,
1004                 f.get('fps') if f.get('fps') is not None else -1,
1005                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1006                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1007                 f.get('format_id') if f.get('format_id') is not None else '',
1008             )
1009         formats.sort(key=_formats_key)
1010
1011     def _check_formats(self, formats, video_id):
1012         if formats:
1013             formats[:] = filter(
1014                 lambda f: self._is_valid_url(
1015                     f['url'], video_id,
1016                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1017                 formats)
1018
1019     @staticmethod
1020     def _remove_duplicate_formats(formats):
1021         format_urls = set()
1022         unique_formats = []
1023         for f in formats:
1024             if f['url'] not in format_urls:
1025                 format_urls.add(f['url'])
1026                 unique_formats.append(f)
1027         formats[:] = unique_formats
1028
1029     def _is_valid_url(self, url, video_id, item='video', headers={}):
1030         url = self._proto_relative_url(url, scheme='http:')
1031         # For now assume non HTTP(S) URLs always valid
1032         if not (url.startswith('http://') or url.startswith('https://')):
1033             return True
1034         try:
1035             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1036             return True
1037         except ExtractorError as e:
1038             if isinstance(e.cause, compat_urllib_error.URLError):
1039                 self.to_screen(
1040                     '%s: %s URL is invalid, skipping' % (video_id, item))
1041                 return False
1042             raise
1043
1044     def http_scheme(self):
1045         """ Either "http:" or "https:", depending on the user's preferences """
1046         return (
1047             'http:'
1048             if self._downloader.params.get('prefer_insecure', False)
1049             else 'https:')
1050
1051     def _proto_relative_url(self, url, scheme=None):
1052         if url is None:
1053             return url
1054         if url.startswith('//'):
1055             if scheme is None:
1056                 scheme = self.http_scheme()
1057             return scheme + url
1058         else:
1059             return url
1060
1061     def _sleep(self, timeout, video_id, msg_template=None):
1062         if msg_template is None:
1063             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1064         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1065         self.to_screen(msg)
1066         time.sleep(timeout)
1067
1068     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1069                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1070                              fatal=True, m3u8_id=None):
1071         manifest = self._download_xml(
1072             manifest_url, video_id, 'Downloading f4m manifest',
1073             'Unable to download f4m manifest',
1074             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1075             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1076             transform_source=transform_source,
1077             fatal=fatal)
1078
1079         if manifest is False:
1080             return []
1081
1082         return self._parse_f4m_formats(
1083             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1084             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1085
1086     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1087                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1088                            fatal=True, m3u8_id=None):
1089         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1090         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1091         if akamai_pv is not None and ';' in akamai_pv.text:
1092             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1093             if playerVerificationChallenge.strip() != '':
1094                 return []
1095
1096         formats = []
1097         manifest_version = '1.0'
1098         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1099         if not media_nodes:
1100             manifest_version = '2.0'
1101             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1102         # Remove unsupported DRM protected media from final formats
1103         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1104         media_nodes = remove_encrypted_media(media_nodes)
1105         if not media_nodes:
1106             return formats
1107         base_url = xpath_text(
1108             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1109             'base URL', default=None)
1110         if base_url:
1111             base_url = base_url.strip()
1112
1113         bootstrap_info = xpath_element(
1114             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1115             'bootstrap info', default=None)
1116
1117         vcodec = None
1118         mime_type = xpath_text(
1119             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1120             'base URL', default=None)
1121         if mime_type and mime_type.startswith('audio/'):
1122             vcodec = 'none'
1123
1124         for i, media_el in enumerate(media_nodes):
1125             tbr = int_or_none(media_el.attrib.get('bitrate'))
1126             width = int_or_none(media_el.attrib.get('width'))
1127             height = int_or_none(media_el.attrib.get('height'))
1128             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1129             # If <bootstrapInfo> is present, the specified f4m is a
1130             # stream-level manifest, and only set-level manifests may refer to
1131             # external resources.  See section 11.4 and section 4 of F4M spec
1132             if bootstrap_info is None:
1133                 media_url = None
1134                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1135                 if manifest_version == '2.0':
1136                     media_url = media_el.attrib.get('href')
1137                 if media_url is None:
1138                     media_url = media_el.attrib.get('url')
1139                 if not media_url:
1140                     continue
1141                 manifest_url = (
1142                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1143                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1144                 # If media_url is itself a f4m manifest do the recursive extraction
1145                 # since bitrates in parent manifest (this one) and media_url manifest
1146                 # may differ leading to inability to resolve the format by requested
1147                 # bitrate in f4m downloader
1148                 ext = determine_ext(manifest_url)
1149                 if ext == 'f4m':
1150                     f4m_formats = self._extract_f4m_formats(
1151                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1152                         transform_source=transform_source, fatal=fatal)
1153                     # Sometimes stream-level manifest contains single media entry that
1154                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1155                     # At the same time parent's media entry in set-level manifest may
1156                     # contain it. We will copy it from parent in such cases.
1157                     if len(f4m_formats) == 1:
1158                         f = f4m_formats[0]
1159                         f.update({
1160                             'tbr': f.get('tbr') or tbr,
1161                             'width': f.get('width') or width,
1162                             'height': f.get('height') or height,
1163                             'format_id': f.get('format_id') if not tbr else format_id,
1164                             'vcodec': vcodec,
1165                         })
1166                     formats.extend(f4m_formats)
1167                     continue
1168                 elif ext == 'm3u8':
1169                     formats.extend(self._extract_m3u8_formats(
1170                         manifest_url, video_id, 'mp4', preference=preference,
1171                         m3u8_id=m3u8_id, fatal=fatal))
1172                     continue
1173             formats.append({
1174                 'format_id': format_id,
1175                 'url': manifest_url,
1176                 'manifest_url': manifest_url,
1177                 'ext': 'flv' if bootstrap_info is not None else None,
1178                 'tbr': tbr,
1179                 'width': width,
1180                 'height': height,
1181                 'vcodec': vcodec,
1182                 'preference': preference,
1183             })
1184         return formats
1185
1186     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1187         return {
1188             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1189             'url': m3u8_url,
1190             'ext': ext,
1191             'protocol': 'm3u8',
1192             'preference': preference - 100 if preference else -100,
1193             'resolution': 'multiple',
1194             'format_note': 'Quality selection URL',
1195         }
1196
1197     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1198                               entry_protocol='m3u8', preference=None,
1199                               m3u8_id=None, note=None, errnote=None,
1200                               fatal=True, live=False):
1201
1202         res = self._download_webpage_handle(
1203             m3u8_url, video_id,
1204             note=note or 'Downloading m3u8 information',
1205             errnote=errnote or 'Failed to download m3u8 information',
1206             fatal=fatal)
1207         if res is False:
1208             return []
1209         m3u8_doc, urlh = res
1210         m3u8_url = urlh.geturl()
1211
1212         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1213             return []
1214
1215         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1216
1217         format_url = lambda u: (
1218             u
1219             if re.match(r'^https?://', u)
1220             else compat_urlparse.urljoin(m3u8_url, u))
1221
1222         # We should try extracting formats only from master playlists [1], i.e.
1223         # playlists that describe available qualities. On the other hand media
1224         # playlists [2] should be returned as is since they contain just the media
1225         # without qualities renditions.
1226         # Fortunately, master playlist can be easily distinguished from media
1227         # playlist based on particular tags availability. As of [1, 2] master
1228         # playlist tags MUST NOT appear in a media playist and vice versa.
1229         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1230         # and MUST NOT appear in master playlist thus we can clearly detect media
1231         # playlist with this criterion.
1232         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1233         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1234         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1235         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1236             return [{
1237                 'url': m3u8_url,
1238                 'format_id': m3u8_id,
1239                 'ext': ext,
1240                 'protocol': entry_protocol,
1241                 'preference': preference,
1242             }]
1243         audio_in_video_stream = {}
1244         last_info = {}
1245         last_media = {}
1246         for line in m3u8_doc.splitlines():
1247             if line.startswith('#EXT-X-STREAM-INF:'):
1248                 last_info = parse_m3u8_attributes(line)
1249             elif line.startswith('#EXT-X-MEDIA:'):
1250                 media = parse_m3u8_attributes(line)
1251                 media_type = media.get('TYPE')
1252                 if media_type in ('VIDEO', 'AUDIO'):
1253                     group_id = media.get('GROUP-ID')
1254                     media_url = media.get('URI')
1255                     if media_url:
1256                         format_id = []
1257                         for v in (group_id, media.get('NAME')):
1258                             if v:
1259                                 format_id.append(v)
1260                         f = {
1261                             'format_id': '-'.join(format_id),
1262                             'url': format_url(media_url),
1263                             'language': media.get('LANGUAGE'),
1264                             'ext': ext,
1265                             'protocol': entry_protocol,
1266                             'preference': preference,
1267                         }
1268                         if media_type == 'AUDIO':
1269                             f['vcodec'] = 'none'
1270                             if group_id and not audio_in_video_stream.get(group_id):
1271                                 audio_in_video_stream[group_id] = False
1272                         formats.append(f)
1273                     else:
1274                         # When there is no URI in EXT-X-MEDIA let this tag's
1275                         # data be used by regular URI lines below
1276                         last_media = media
1277                         if media_type == 'AUDIO' and group_id:
1278                             audio_in_video_stream[group_id] = True
1279             elif line.startswith('#') or not line.strip():
1280                 continue
1281             else:
1282                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1283                 format_id = []
1284                 if m3u8_id:
1285                     format_id.append(m3u8_id)
1286                 # Despite specification does not mention NAME attribute for
1287                 # EXT-X-STREAM-INF it still sometimes may be present
1288                 stream_name = last_info.get('NAME') or last_media.get('NAME')
1289                 # Bandwidth of live streams may differ over time thus making
1290                 # format_id unpredictable. So it's better to keep provided
1291                 # format_id intact.
1292                 if not live:
1293                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1294                 manifest_url = format_url(line.strip())
1295                 f = {
1296                     'format_id': '-'.join(format_id),
1297                     'url': manifest_url,
1298                     'manifest_url': manifest_url,
1299                     'tbr': tbr,
1300                     'ext': ext,
1301                     'fps': float_or_none(last_info.get('FRAME-RATE')),
1302                     'protocol': entry_protocol,
1303                     'preference': preference,
1304                 }
1305                 resolution = last_info.get('RESOLUTION')
1306                 if resolution:
1307                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1308                     if mobj:
1309                         f['width'] = int(mobj.group('width'))
1310                         f['height'] = int(mobj.group('height'))
1311                 # Unified Streaming Platform
1312                 mobj = re.search(
1313                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1314                 if mobj:
1315                     abr, vbr = mobj.groups()
1316                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1317                     f.update({
1318                         'vbr': vbr,
1319                         'abr': abr,
1320                     })
1321                 f.update(parse_codecs(last_info.get('CODECS')))
1322                 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1323                     # TODO: update acodec for audio only formats with the same GROUP-ID
1324                     f['acodec'] = 'none'
1325                 formats.append(f)
1326                 last_info = {}
1327                 last_media = {}
1328         return formats
1329
1330     @staticmethod
1331     def _xpath_ns(path, namespace=None):
1332         if not namespace:
1333             return path
1334         out = []
1335         for c in path.split('/'):
1336             if not c or c == '.':
1337                 out.append(c)
1338             else:
1339                 out.append('{%s}%s' % (namespace, c))
1340         return '/'.join(out)
1341
1342     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1343         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1344
1345         if smil is False:
1346             assert not fatal
1347             return []
1348
1349         namespace = self._parse_smil_namespace(smil)
1350
1351         return self._parse_smil_formats(
1352             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1353
1354     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1355         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1356         if smil is False:
1357             return {}
1358         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1359
1360     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1361         return self._download_xml(
1362             smil_url, video_id, 'Downloading SMIL file',
1363             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1364
1365     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1366         namespace = self._parse_smil_namespace(smil)
1367
1368         formats = self._parse_smil_formats(
1369             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1370         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1371
1372         video_id = os.path.splitext(url_basename(smil_url))[0]
1373         title = None
1374         description = None
1375         upload_date = None
1376         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1377             name = meta.attrib.get('name')
1378             content = meta.attrib.get('content')
1379             if not name or not content:
1380                 continue
1381             if not title and name == 'title':
1382                 title = content
1383             elif not description and name in ('description', 'abstract'):
1384                 description = content
1385             elif not upload_date and name == 'date':
1386                 upload_date = unified_strdate(content)
1387
1388         thumbnails = [{
1389             'id': image.get('type'),
1390             'url': image.get('src'),
1391             'width': int_or_none(image.get('width')),
1392             'height': int_or_none(image.get('height')),
1393         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1394
1395         return {
1396             'id': video_id,
1397             'title': title or video_id,
1398             'description': description,
1399             'upload_date': upload_date,
1400             'thumbnails': thumbnails,
1401             'formats': formats,
1402             'subtitles': subtitles,
1403         }
1404
    def _parse_smil_namespace(self, smil):
        """Return the XML namespace of the root <smil> tag, or None if absent."""
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1408
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from the <video>/<audio> elements of a SMIL document.

        RTMP, HLS (m3u8), HDS (f4m) and plain HTTP sources are each handled by
        a dedicated branch below. transform_rtmp_url, when given, is a callable
        (streamer, play_path) -> (streamer, play_path) that post-processes RTMP
        URLs.
        """
        # Base URL for resolving relative sources; a <meta base=...> (or
        # httpBase) in the head overrides the manifest URL itself
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # Sources already processed, to drop duplicates
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            # system-bitrate is in bits/s; normalize to kbit/s
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            # RTMP: the streamer is the connection URL and src is the play path
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry playlist carries no own metadata, so attach
                # the bitrate/size info from the SMIL entry to it
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                # Default Adobe HDS parameters when the caller supplies none
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            # NOTE(review): validity is checked against the raw `src`, not the
            # resolved `src_url` — possibly intentional, but looks like it was
            # meant to be src_url; confirm before changing
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats
1502
1503     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1504         urls = []
1505         subtitles = {}
1506         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1507             src = textstream.get('src')
1508             if not src or src in urls:
1509                 continue
1510             urls.append(src)
1511             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1512             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1513             subtitles.setdefault(lang, []).append({
1514                 'url': src,
1515                 'ext': ext,
1516             })
1517         return subtitles
1518
1519     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1520         xspf = self._download_xml(
1521             playlist_url, playlist_id, 'Downloading xpsf playlist',
1522             'Unable to download xspf manifest', fatal=fatal)
1523         if xspf is False:
1524             return []
1525         return self._parse_xspf(xspf, playlist_id)
1526
1527     def _parse_xspf(self, playlist, playlist_id):
1528         NS_MAP = {
1529             'xspf': 'http://xspf.org/ns/0/',
1530             's1': 'http://static.streamone.nl/player/ns/0',
1531         }
1532
1533         entries = []
1534         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1535             title = xpath_text(
1536                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1537             description = xpath_text(
1538                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1539             thumbnail = xpath_text(
1540                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1541             duration = float_or_none(
1542                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1543
1544             formats = [{
1545                 'url': location.text,
1546                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1547                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1548                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1549             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1550             self._sort_formats(formats)
1551
1552             entries.append({
1553                 'id': playlist_id,
1554                 'title': title,
1555                 'description': description,
1556                 'thumbnail': thumbnail,
1557                 'duration': duration,
1558                 'formats': formats,
1559             })
1560         return entries
1561
1562     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1563         res = self._download_webpage_handle(
1564             mpd_url, video_id,
1565             note=note or 'Downloading MPD manifest',
1566             errnote=errnote or 'Failed to download MPD manifest',
1567             fatal=fatal)
1568         if res is False:
1569             return []
1570         mpd, urlh = res
1571         mpd_base_url = base_url(urlh.geturl())
1572
1573         return self._parse_mpd_formats(
1574             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1575             formats_dict=formats_dict, mpd_url=mpd_url)
1576
1577     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1578         """
1579         Parse formats from MPD manifest.
1580         References:
1581          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1582             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1583          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1584         """
1585         if mpd_doc.get('type') == 'dynamic':
1586             return []
1587
1588         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1589
1590         def _add_ns(path):
1591             return self._xpath_ns(path, namespace)
1592
1593         def is_drm_protected(element):
1594             return element.find(_add_ns('ContentProtection')) is not None
1595
1596         def extract_multisegment_info(element, ms_parent_info):
1597             ms_info = ms_parent_info.copy()
1598
1599             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1600             # common attributes and elements.  We will only extract relevant
1601             # for us.
1602             def extract_common(source):
1603                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1604                 if segment_timeline is not None:
1605                     s_e = segment_timeline.findall(_add_ns('S'))
1606                     if s_e:
1607                         ms_info['total_number'] = 0
1608                         ms_info['s'] = []
1609                         for s in s_e:
1610                             r = int(s.get('r', 0))
1611                             ms_info['total_number'] += 1 + r
1612                             ms_info['s'].append({
1613                                 't': int(s.get('t', 0)),
1614                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1615                                 'd': int(s.attrib['d']),
1616                                 'r': r,
1617                             })
1618                 start_number = source.get('startNumber')
1619                 if start_number:
1620                     ms_info['start_number'] = int(start_number)
1621                 timescale = source.get('timescale')
1622                 if timescale:
1623                     ms_info['timescale'] = int(timescale)
1624                 segment_duration = source.get('duration')
1625                 if segment_duration:
1626                     ms_info['segment_duration'] = int(segment_duration)
1627
1628             def extract_Initialization(source):
1629                 initialization = source.find(_add_ns('Initialization'))
1630                 if initialization is not None:
1631                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1632
1633             segment_list = element.find(_add_ns('SegmentList'))
1634             if segment_list is not None:
1635                 extract_common(segment_list)
1636                 extract_Initialization(segment_list)
1637                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1638                 if segment_urls_e:
1639                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1640             else:
1641                 segment_template = element.find(_add_ns('SegmentTemplate'))
1642                 if segment_template is not None:
1643                     extract_common(segment_template)
1644                     media = segment_template.get('media')
1645                     if media:
1646                         ms_info['media'] = media
1647                     initialization = segment_template.get('initialization')
1648                     if initialization:
1649                         ms_info['initialization'] = initialization
1650                     else:
1651                         extract_Initialization(segment_template)
1652             return ms_info
1653
1654         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1655         formats = []
1656         for period in mpd_doc.findall(_add_ns('Period')):
1657             period_duration = parse_duration(period.get('duration')) or mpd_duration
1658             period_ms_info = extract_multisegment_info(period, {
1659                 'start_number': 1,
1660                 'timescale': 1,
1661             })
1662             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1663                 if is_drm_protected(adaptation_set):
1664                     continue
1665                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1666                 for representation in adaptation_set.findall(_add_ns('Representation')):
1667                     if is_drm_protected(representation):
1668                         continue
1669                     representation_attrib = adaptation_set.attrib.copy()
1670                     representation_attrib.update(representation.attrib)
1671                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1672                     mime_type = representation_attrib['mimeType']
1673                     content_type = mime_type.split('/')[0]
1674                     if content_type == 'text':
1675                         # TODO implement WebVTT downloading
1676                         pass
1677                     elif content_type == 'video' or content_type == 'audio':
1678                         base_url = ''
1679                         for element in (representation, adaptation_set, period, mpd_doc):
1680                             base_url_e = element.find(_add_ns('BaseURL'))
1681                             if base_url_e is not None:
1682                                 base_url = base_url_e.text + base_url
1683                                 if re.match(r'^https?://', base_url):
1684                                     break
1685                         if mpd_base_url and not re.match(r'^https?://', base_url):
1686                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1687                                 mpd_base_url += '/'
1688                             base_url = mpd_base_url + base_url
1689                         representation_id = representation_attrib.get('id')
1690                         lang = representation_attrib.get('lang')
1691                         url_el = representation.find(_add_ns('BaseURL'))
1692                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1693                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1694                         f = {
1695                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1696                             'url': base_url,
1697                             'manifest_url': mpd_url,
1698                             'ext': mimetype2ext(mime_type),
1699                             'width': int_or_none(representation_attrib.get('width')),
1700                             'height': int_or_none(representation_attrib.get('height')),
1701                             'tbr': int_or_none(bandwidth, 1000),
1702                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1703                             'fps': int_or_none(representation_attrib.get('frameRate')),
1704                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1705                             'format_note': 'DASH %s' % content_type,
1706                             'filesize': filesize,
1707                         }
1708                         f.update(parse_codecs(representation_attrib.get('codecs')))
1709                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1710
1711                         def prepare_template(template_name, identifiers):
1712                             t = representation_ms_info[template_name]
1713                             t = t.replace('$RepresentationID$', representation_id)
1714                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1715                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1716                             t.replace('$$', '$')
1717                             return t
1718
1719                         # @initialization is a regular template like @media one
1720                         # so it should be handled just the same way (see
1721                         # https://github.com/rg3/youtube-dl/issues/11605)
1722                         if 'initialization' in representation_ms_info:
1723                             initialization_template = prepare_template(
1724                                 'initialization',
1725                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1726                                 # $Time$ shall not be included for @initialization thus
1727                                 # only $Bandwidth$ remains
1728                                 ('Bandwidth', ))
1729                             representation_ms_info['initialization_url'] = initialization_template % {
1730                                 'Bandwidth': bandwidth,
1731                             }
1732
1733                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1734
1735                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1736
1737                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1738                             # can't be used at the same time
1739                             if '%(Number' in media_template and 's' not in representation_ms_info:
1740                                 segment_duration = None
1741                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1742                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1743                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1744                                 representation_ms_info['fragments'] = [{
1745                                     'url': media_template % {
1746                                         'Number': segment_number,
1747                                         'Bandwidth': bandwidth,
1748                                     },
1749                                     'duration': segment_duration,
1750                                 } for segment_number in range(
1751                                     representation_ms_info['start_number'],
1752                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1753                             else:
1754                                 # $Number*$ or $Time$ in media template with S list available
1755                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1756                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1757                                 representation_ms_info['fragments'] = []
1758                                 segment_time = 0
1759                                 segment_d = None
1760                                 segment_number = representation_ms_info['start_number']
1761
1762                                 def add_segment_url():
1763                                     segment_url = media_template % {
1764                                         'Time': segment_time,
1765                                         'Bandwidth': bandwidth,
1766                                         'Number': segment_number,
1767                                     }
1768                                     representation_ms_info['fragments'].append({
1769                                         'url': segment_url,
1770                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1771                                     })
1772
1773                                 for num, s in enumerate(representation_ms_info['s']):
1774                                     segment_time = s.get('t') or segment_time
1775                                     segment_d = s['d']
1776                                     add_segment_url()
1777                                     segment_number += 1
1778                                     for r in range(s.get('r', 0)):
1779                                         segment_time += segment_d
1780                                         add_segment_url()
1781                                         segment_number += 1
1782                                     segment_time += segment_d
1783                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1784                             # No media template
1785                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1786                             # or any YouTube dashsegments video
1787                             fragments = []
1788                             segment_index = 0
1789                             timescale = representation_ms_info['timescale']
1790                             for s in representation_ms_info['s']:
1791                                 duration = float_or_none(s['d'], timescale)
1792                                 for r in range(s.get('r', 0) + 1):
1793                                     fragments.append({
1794                                         'url': representation_ms_info['segment_urls'][segment_index],
1795                                         'duration': duration,
1796                                     })
1797                                     segment_index += 1
1798                             representation_ms_info['fragments'] = fragments
1799                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1800                         # No fragments key is present in this case.
1801                         if 'fragments' in representation_ms_info:
1802                             f.update({
1803                                 'fragments': [],
1804                                 'protocol': 'http_dash_segments',
1805                             })
1806                             if 'initialization_url' in representation_ms_info:
1807                                 initialization_url = representation_ms_info['initialization_url']
1808                                 if not f.get('url'):
1809                                     f['url'] = initialization_url
1810                                 f['fragments'].append({'url': initialization_url})
1811                             f['fragments'].extend(representation_ms_info['fragments'])
1812                             for fragment in f['fragments']:
1813                                 fragment['url'] = urljoin(base_url, fragment['url'])
1814                         try:
1815                             existing_format = next(
1816                                 fo for fo in formats
1817                                 if fo['format_id'] == representation_id)
1818                         except StopIteration:
1819                             full_info = formats_dict.get(representation_id, {}).copy()
1820                             full_info.update(f)
1821                             formats.append(full_info)
1822                         else:
1823                             existing_format.update(f)
1824                     else:
1825                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1826         return formats
1827
1828     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1829         res = self._download_webpage_handle(
1830             ism_url, video_id,
1831             note=note or 'Downloading ISM manifest',
1832             errnote=errnote or 'Failed to download ISM manifest',
1833             fatal=fatal)
1834         if res is False:
1835             return []
1836         ism, urlh = res
1837
1838         return self._parse_ism_formats(
1839             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1840
1841     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1842         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1843             return []
1844
1845         duration = int(ism_doc.attrib['Duration'])
1846         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1847
1848         formats = []
1849         for stream in ism_doc.findall('StreamIndex'):
1850             stream_type = stream.get('Type')
1851             if stream_type not in ('video', 'audio'):
1852                 continue
1853             url_pattern = stream.attrib['Url']
1854             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1855             stream_name = stream.get('Name')
1856             for track in stream.findall('QualityLevel'):
1857                 fourcc = track.get('FourCC')
1858                 # TODO: add support for WVC1 and WMAP
1859                 if fourcc not in ('H264', 'AVC1', 'AACL'):
1860                     self.report_warning('%s is not a supported codec' % fourcc)
1861                     continue
1862                 tbr = int(track.attrib['Bitrate']) // 1000
1863                 width = int_or_none(track.get('MaxWidth'))
1864                 height = int_or_none(track.get('MaxHeight'))
1865                 sampling_rate = int_or_none(track.get('SamplingRate'))
1866
1867                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1868                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1869
1870                 fragments = []
1871                 fragment_ctx = {
1872                     'time': 0,
1873                 }
1874                 stream_fragments = stream.findall('c')
1875                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1876                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1877                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1878                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1879                     if not fragment_ctx['duration']:
1880                         try:
1881                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1882                         except IndexError:
1883                             next_fragment_time = duration
1884                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1885                     for _ in range(fragment_repeat):
1886                         fragments.append({
1887                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1888                             'duration': fragment_ctx['duration'] / stream_timescale,
1889                         })
1890                         fragment_ctx['time'] += fragment_ctx['duration']
1891
1892                 format_id = []
1893                 if ism_id:
1894                     format_id.append(ism_id)
1895                 if stream_name:
1896                     format_id.append(stream_name)
1897                 format_id.append(compat_str(tbr))
1898
1899                 formats.append({
1900                     'format_id': '-'.join(format_id),
1901                     'url': ism_url,
1902                     'manifest_url': ism_url,
1903                     'ext': 'ismv' if stream_type == 'video' else 'isma',
1904                     'width': width,
1905                     'height': height,
1906                     'tbr': tbr,
1907                     'asr': sampling_rate,
1908                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
1909                     'acodec': 'none' if stream_type == 'video' else fourcc,
1910                     'protocol': 'ism',
1911                     'fragments': fragments,
1912                     '_download_params': {
1913                         'duration': duration,
1914                         'timescale': stream_timescale,
1915                         'width': width or 0,
1916                         'height': height or 0,
1917                         'fourcc': fourcc,
1918                         'codec_private_data': track.get('CodecPrivateData'),
1919                         'sampling_rate': sampling_rate,
1920                         'channels': int_or_none(track.get('Channels', 2)),
1921                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
1922                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
1923                     },
1924                 })
1925         return formats
1926
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
        """Extract entries (formats + subtitles) from HTML5 <video>/<audio> tags.

        Scans *webpage* for media tags, resolves their src/<source>/<track>
        children against *base_url* and returns a list of dicts with
        'formats', 'subtitles' and 'thumbnail' keys; tags yielding neither
        formats nor subtitles are dropped.
        """
        def absolute_url(video_url):
            # Resolve relative media URLs against the page URL.
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Derive ext/codec hints from a MIME type such as
            # 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"'.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type):
            # Returns (is_plain_url, formats): m3u8/mpd manifest URLs are
            # expanded into their variant formats; anything else becomes a
            # single direct-URL format.
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # Self-closing tags first; they carry no inner content to scan.
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        # Merge codec info from the type= attribute into the
                        # single plain-URL format.
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        # NOTE(review): lang may be None when none of
                        # srclang/lang/label is present — verify callers
                        # tolerate a None subtitle key.
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2010
2011     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2012         formats = []
2013         hdcore_sign = 'hdcore=3.7.0'
2014         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2015         hds_host = hosts.get('hds')
2016         if hds_host:
2017             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2018         if 'hdcore=' not in f4m_url:
2019             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2020         f4m_formats = self._extract_f4m_formats(
2021             f4m_url, video_id, f4m_id='hds', fatal=False)
2022         for entry in f4m_formats:
2023             entry.update({'extra_param_to_segment_url': hdcore_sign})
2024         formats.extend(f4m_formats)
2025         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2026         hls_host = hosts.get('hls')
2027         if hls_host:
2028             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2029         formats.extend(self._extract_m3u8_formats(
2030             m3u8_url, video_id, 'mp4', 'm3u8_native',
2031             m3u8_id='hls', fatal=False))
2032         return formats
2033
2034     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2035         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2036         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2037         http_base_url = 'http' + url_base
2038         formats = []
2039         if 'm3u8' not in skip_protocols:
2040             formats.extend(self._extract_m3u8_formats(
2041                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2042                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2043         if 'f4m' not in skip_protocols:
2044             formats.extend(self._extract_f4m_formats(
2045                 http_base_url + '/manifest.f4m',
2046                 video_id, f4m_id='hds', fatal=False))
2047         if 'dash' not in skip_protocols:
2048             formats.extend(self._extract_mpd_formats(
2049                 http_base_url + '/manifest.mpd',
2050                 video_id, mpd_id='dash', fatal=False))
2051         if re.search(r'(?:/smil:|\.smil)', url_base):
2052             if 'smil' not in skip_protocols:
2053                 rtmp_formats = self._extract_smil_formats(
2054                     http_base_url + '/jwplayer.smil',
2055                     video_id, fatal=False)
2056                 for rtmp_format in rtmp_formats:
2057                     rtsp_format = rtmp_format.copy()
2058                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2059                     del rtsp_format['play_path']
2060                     del rtsp_format['ext']
2061                     rtsp_format.update({
2062                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2063                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2064                         'protocol': 'rtsp',
2065                     })
2066                     formats.extend([rtmp_format, rtsp_format])
2067         else:
2068             for protocol in ('rtmp', 'rtsp'):
2069                 if protocol not in skip_protocols:
2070                     formats.append({
2071                         'url': protocol + url_base,
2072                         'format_id': protocol,
2073                         'protocol': protocol,
2074                     })
2075         return formats
2076
2077     @staticmethod
2078     def _find_jwplayer_data(webpage):
2079         mobj = re.search(
2080             r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2081             webpage)
2082         if mobj:
2083             return mobj.group('options')
2084
2085     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2086         jwplayer_data = self._parse_json(
2087             self._find_jwplayer_data(webpage), video_id,
2088             transform_source=js_to_json)
2089         return self._parse_jwplayer_data(
2090             jwplayer_data, video_id, *args, **kwargs)
2091
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a parsed JWPlayer setup() configuration into an info dict.

        Returns a single entry dict when the playlist has exactly one item,
        otherwise a playlist result.  Raises KeyError when a playlist item
        lacks 'mediaid' (and no *video_id* was given) or, with
        *require_title*, lacks 'title'.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = []
            for source in video_data['sources']:
                source_url = self._proto_relative_url(source['file'])
                if base_url:
                    source_url = compat_urlparse.urljoin(base_url, source_url)
                source_type = source.get('type') or ''
                # Prefer the declared MIME type; fall back to the URL suffix.
                ext = mimetype2ext(source_type) or determine_ext(source_url)
                if source_type == 'hls' or ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
                elif ext == 'mpd':
                    formats.extend(self._extract_mpd_formats(
                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                    formats.append({
                        'url': source_url,
                        'vcodec': 'none',
                        'ext': ext,
                    })
                else:
                    height = int_or_none(source.get('height'))
                    if height is None:
                        # Often no height is provided but there is a label in
                        # format like 1080p.
                        height = int_or_none(self._search_regex(
                            r'^(\d{3,})[pP]$', source.get('label') or '',
                            'height', default=None))
                    a_format = {
                        'url': source_url,
                        'width': int_or_none(source.get('width')),
                        'height': height,
                        'ext': ext,
                    }
                    if source_url.startswith('rtmp'):
                        a_format['ext'] = 'flv'

                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                        # of jwplayer.flash.swf
                        rtmp_url_parts = re.split(
                            r'((?:mp4|mp3|flv):)', source_url, 1)
                        if len(rtmp_url_parts) == 3:
                            rtmp_url, prefix, play_path = rtmp_url_parts
                            a_format.update({
                                'url': rtmp_url,
                                'play_path': prefix + play_path,
                            })
                        if rtmp_params:
                            a_format.update(rtmp_params)
                    formats.append(a_format)
            self._sort_formats(formats)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    # Only caption tracks are turned into subtitles.
                    if track.get('kind') != 'captions':
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entries.append({
                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'formats': formats,
            })
        # A single entry is returned directly; multiple become a playlist.
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2193
2194     def _live_title(self, name):
2195         """ Generate the title for a live video """
2196         now = datetime.datetime.now()
2197         now_str = now.strftime('%Y-%m-%d %H:%M')
2198         return name + ' ' + now_str
2199
2200     def _int(self, v, name, fatal=False, **kwargs):
2201         res = int_or_none(v, **kwargs)
2202         if 'get_attr' in kwargs:
2203             print(getattr(v, kwargs['get_attr']))
2204         if res is None:
2205             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2206             if fatal:
2207                 raise ExtractorError(msg)
2208             else:
2209                 self._downloader.report_warning(msg)
2210         return res
2211
2212     def _float(self, v, name, fatal=False, **kwargs):
2213         res = float_or_none(v, **kwargs)
2214         if res is None:
2215             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2216             if fatal:
2217                 raise ExtractorError(msg)
2218             else:
2219                 self._downloader.report_warning(msg)
2220         return res
2221
    def _set_cookie(self, domain, name, value, expire_time=None):
        """Store a cookie for *domain* in the downloader's cookie jar.

        The cookie is created for path '/', non-secure, expiring at
        *expire_time* (a Unix timestamp) or never when None; the positional
        arguments follow the cookielib.Cookie constructor order.
        """
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
2227
2228     def _get_cookies(self, url):
2229         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2230         req = sanitized_Request(url)
2231         self._downloader.cookiejar.add_cookie_header(req)
2232         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2233
2234     def get_testcases(self, include_onlymatching=False):
2235         t = getattr(self, '_TEST', None)
2236         if t:
2237             assert not hasattr(self, '_TESTS'), \
2238                 '%s has _TEST and _TESTS' % type(self).__name__
2239             tests = [t]
2240         else:
2241             tests = getattr(self, '_TESTS', [])
2242         for t in tests:
2243             if not include_onlymatching and t.get('only_matching', False):
2244                 continue
2245             t['name'] = type(self).__name__[:-len('IE')]
2246             yield t
2247
2248     def is_suitable(self, age_limit):
2249         """ Test whether the extractor is generally suitable for the given
2250         age limit (i.e. pornographic sites are not, all others usually are) """
2251
2252         any_restricted = False
2253         for tc in self.get_testcases(include_onlymatching=False):
2254             if tc.get('playlist', []):
2255                 tc = tc['playlist'][0]
2256             is_restricted = age_restricted(
2257                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2258             if not is_restricted:
2259                 return True
2260             any_restricted = any_restricted or is_restricted
2261         return not any_restricted
2262
2263     def extract_subtitles(self, *args, **kwargs):
2264         if (self._downloader.params.get('writesubtitles', False) or
2265                 self._downloader.params.get('listsubtitles')):
2266             return self._get_subtitles(*args, **kwargs)
2267         return {}
2268
    def _get_subtitles(self, *args, **kwargs):
        # Subclasses override this to return a dict mapping language tags to
        # lists of subtitle entry dicts.
        raise NotImplementedError('This method must be implemented by subclasses')
2271
2272     @staticmethod
2273     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2274         """ Merge subtitle items for one language. Items with duplicated URLs
2275         will be dropped. """
2276         list1_urls = set([item['url'] for item in subtitle_list1])
2277         ret = list(subtitle_list1)
2278         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2279         return ret
2280
2281     @classmethod
2282     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2283         """ Merge two subtitle dictionaries, language by language. """
2284         ret = dict(subtitle_dict1)
2285         for lang in subtitle_dict2:
2286             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2287         return ret
2288
2289     def extract_automatic_captions(self, *args, **kwargs):
2290         if (self._downloader.params.get('writeautomaticsub', False) or
2291                 self._downloader.params.get('listsubtitles')):
2292             return self._get_automatic_captions(*args, **kwargs)
2293         return {}
2294
    def _get_automatic_captions(self, *args, **kwargs):
        # Subclasses override this to return automatically generated captions
        # in the same shape as _get_subtitles.
        raise NotImplementedError('This method must be implemented by subclasses')
2297
2298     def mark_watched(self, *args, **kwargs):
2299         if (self._downloader.params.get('mark_watched', False) and
2300                 (self._get_login_info()[0] is not None or
2301                     self._downloader.params.get('cookiefile') is not None)):
2302             self._mark_watched(*args, **kwargs)
2303
    def _mark_watched(self, *args, **kwargs):
        # Subclasses override this to report the video as watched upstream.
        raise NotImplementedError('This method must be implemented by subclasses')
2306
2307     def geo_verification_headers(self):
2308         headers = {}
2309         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2310         if geo_verification_proxy:
2311             headers['Ytdl-request-proxy'] = geo_verification_proxy
2312         return headers
2313
2314     def _generic_id(self, url):
2315         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2316
2317     def _generic_title(self, url):
2318         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2319
2320
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches '<key>:<query>', '<key><n>:<query>' or '<key>all:<query>'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        # The 'URL' here is the search query itself; the prefix decides how
        # many results to fetch.
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare key: a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        # Public accessor for the search key defined by subclasses.
        return self._SEARCH_KEY