Merge branch 'master' of github.com-rndusr:rg3/youtube-dl into fix/str-item-assignment
[youtube-dl] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import random
10 import re
11 import socket
12 import sys
13 import time
14 import math
15
16 from ..compat import (
17     compat_cookiejar,
18     compat_cookies,
19     compat_etree_fromstring,
20     compat_getpass,
21     compat_http_client,
22     compat_os_name,
23     compat_str,
24     compat_urllib_error,
25     compat_urllib_parse_unquote,
26     compat_urllib_parse_urlencode,
27     compat_urllib_request,
28     compat_urlparse,
29 )
30 from ..downloader.f4m import remove_encrypted_media
31 from ..utils import (
32     NO_DEFAULT,
33     age_restricted,
34     base_url,
35     bug_reports_message,
36     clean_html,
37     compiled_regex_type,
38     determine_ext,
39     determine_protocol,
40     error_to_compat_str,
41     ExtractorError,
42     extract_attributes,
43     fix_xml_ampersands,
44     float_or_none,
45     GeoRestrictedError,
46     GeoUtils,
47     int_or_none,
48     js_to_json,
49     mimetype2ext,
50     orderedSet,
51     parse_codecs,
52     parse_duration,
53     parse_iso8601,
54     parse_m3u8_attributes,
55     RegexNotFoundError,
56     sanitized_Request,
57     sanitize_filename,
58     unescapeHTML,
59     unified_strdate,
60     unified_timestamp,
61     update_Request,
62     update_url_query,
63     urljoin,
64     url_basename,
65     xpath_element,
66     xpath_text,
67     xpath_with_ns,
68 )
69
70
71 class InfoExtractor(object):
72     """Information Extractor class.
73
74     Information extractors are the classes that, given a URL, extract
75     information about the video (or videos) the URL refers to. This
76     information includes the real video URL, the video title, author and
77     others. The information is stored in a dictionary which is then
78     passed to the YoutubeDL. The YoutubeDL processes this
79     information possibly downloading the video to the file system, among
80     other possible outcomes.
81
82     The type field determines the type of the result.
83     By far the most common value (and the default if _type is missing) is
84     "video", which indicates a single video.
85
86     For a video, the dictionaries must include the following fields:
87
88     id:             Video identifier.
89     title:          Video title, unescaped.
90
91     Additionally, it must contain either a formats entry or a url one:
92
93     formats:        A list of dictionaries for each format available, ordered
94                     from worst to best quality.
95
96                     Potential fields:
97                     * url        Mandatory. The URL of the video file
98                     * manifest_url
99                                  The URL of the manifest file in case of
100                                  fragmented media (DASH, hls, hds)
101                     * ext        Will be calculated from URL if missing
102                     * format     A human-readable description of the format
103                                  ("mp4 container with h264/opus").
104                                  Calculated from the format_id, width, height,
105                                  and format_note fields if missing.
106                     * format_id  A short description of the format
107                                  ("mp4_h264_opus" or "19").
108                                 Technically optional, but strongly recommended.
109                     * format_note Additional info about the format
110                                  ("3D" or "DASH video")
111                     * width      Width of the video, if known
112                     * height     Height of the video, if known
113                     * resolution Textual description of width and height
114                     * tbr        Average bitrate of audio and video in KBit/s
115                     * abr        Average audio bitrate in KBit/s
116                     * acodec     Name of the audio codec in use
117                     * asr        Audio sampling rate in Hertz
118                     * vbr        Average video bitrate in KBit/s
119                     * fps        Frame rate
120                     * vcodec     Name of the video codec in use
121                     * container  Name of the container format
122                     * filesize   The number of bytes, if known in advance
123                     * filesize_approx  An estimate for the number of bytes
124                     * player_url SWF Player URL (used for rtmpdump).
125                     * protocol   The protocol that will be used for the actual
126                                  download, lower-case.
127                                  "http", "https", "rtsp", "rtmp", "rtmpe",
128                                  "m3u8", "m3u8_native" or "http_dash_segments".
129                     * fragment_base_url
130                                  Base URL for fragments. Each fragment's path
131                                  value (if present) will be relative to
132                                  this URL.
133                     * fragments  A list of fragments of a fragmented media.
134                                  Each fragment entry must contain either an url
135                                  or a path. If an url is present it should be
136                                  considered by a client. Otherwise both path and
137                                  fragment_base_url must be present. Here is
138                                  the list of all potential fields:
139                                  * "url" - fragment's URL
140                                  * "path" - fragment's path relative to
141                                             fragment_base_url
142                                  * "duration" (optional, int or float)
143                                  * "filesize" (optional, int)
144                     * preference Order number of this format. If this field is
145                                  present and not None, the formats get sorted
146                                  by this field, regardless of all other values.
147                                  -1 for default (order by other properties),
148                                  -2 or smaller for less than default.
149                                  < -1000 to hide the format (if there is
150                                     another one which is strictly better)
151                     * language   Language code, e.g. "de" or "en-US".
152                     * language_preference  Is this in the language mentioned in
153                                  the URL?
154                                  10 if it's what the URL is about,
155                                  -1 for default (don't know),
156                                  -10 otherwise, other values reserved for now.
157                     * quality    Order number of the video quality of this
158                                  format, irrespective of the file format.
159                                  -1 for default (order by other properties),
160                                  -2 or smaller for less than default.
161                     * source_preference  Order number for this video source
162                                   (quality takes higher priority)
163                                  -1 for default (order by other properties),
164                                  -2 or smaller for less than default.
165                     * http_headers  A dictionary of additional HTTP headers
166                                  to add to the request.
167                     * stretched_ratio  If given and not 1, indicates that the
168                                  video's pixels are not square.
169                                  width : height ratio as float.
170                     * no_resume  The server does not support resuming the
171                                  (HTTP or RTMP) download. Boolean.
172
173     url:            Final video URL.
174     ext:            Video filename extension.
175     format:         The video format, defaults to ext (used for --get-format)
176     player_url:     SWF Player URL (used for rtmpdump).
177
178     The following fields are optional:
179
180     alt_title:      A secondary title of the video.
181     display_id      An alternative identifier for the video, not necessarily
182                     unique, but available before title. Typically, id is
183                     something like "4234987", title "Dancing naked mole rats",
184                     and display_id "dancing-naked-mole-rats"
185     thumbnails:     A list of dictionaries, with the following entries:
186                         * "id" (optional, string) - Thumbnail format ID
187                         * "url"
188                         * "preference" (optional, int) - quality of the image
189                         * "width" (optional, int)
190                         * "height" (optional, int)
191                         * "resolution" (optional, string "{width}x{height}",
192                                         deprecated)
193                         * "filesize" (optional, int)
194     thumbnail:      Full URL to a video thumbnail image.
195     description:    Full video description.
196     uploader:       Full name of the video uploader.
197     license:        License name the video is licensed under.
198     creator:        The creator of the video.
199     release_date:   The date (YYYYMMDD) when the video was released.
200     timestamp:      UNIX timestamp of the moment the video became available.
201     upload_date:    Video upload date (YYYYMMDD).
202                     If not explicitly set, calculated from timestamp.
203     uploader_id:    Nickname or id of the video uploader.
204     uploader_url:   Full URL to a personal webpage of the video uploader.
205     location:       Physical location where the video was filmed.
206     subtitles:      The available subtitles as a dictionary in the format
207                     {tag: subformats}. "tag" is usually a language code, and
208                     "subformats" is a list sorted from lower to higher
209                     preference, each element is a dictionary with the "ext"
210                     entry and one of:
211                         * "data": The subtitles file contents
212                         * "url": A URL pointing to the subtitles file
213                     "ext" will be calculated from URL if missing
214     automatic_captions: Like 'subtitles', used by the YoutubeIE for
215                     automatically generated captions
216     duration:       Length of the video in seconds, as an integer or float.
217     view_count:     How many users have watched the video on the platform.
218     like_count:     Number of positive ratings of the video
219     dislike_count:  Number of negative ratings of the video
220     repost_count:   Number of reposts of the video
221     average_rating: Average rating given by users, the scale used depends on the webpage
222     comment_count:  Number of comments on the video
223     comments:       A list of comments, each with one or more of the following
224                     properties (all but one of text or html optional):
225                         * "author" - human-readable name of the comment author
226                         * "author_id" - user ID of the comment author
227                         * "id" - Comment ID
228                         * "html" - Comment as HTML
229                         * "text" - Plain text of the comment
230                         * "timestamp" - UNIX timestamp of comment
231                         * "parent" - ID of the comment this one is replying to.
232                                      Set to "root" to indicate that this is a
233                                      comment to the original video.
234     age_limit:      Age restriction for the video, as an integer (years)
235     webpage_url:    The URL to the video webpage, if given to youtube-dl it
236                     should allow to get the same result again. (It will be set
237                     by YoutubeDL if it's missing)
238     categories:     A list of categories that the video falls in, for example
239                     ["Sports", "Berlin"]
240     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
241     is_live:        True, False, or None (=unknown). Whether this video is a
242                     live stream that goes on instead of a fixed-length video.
243     start_time:     Time in seconds where the reproduction should start, as
244                     specified in the URL.
245     end_time:       Time in seconds where the reproduction should end, as
246                     specified in the URL.
247
248     The following fields should only be used when the video belongs to some logical
249     chapter or section:
250
251     chapter:        Name or title of the chapter the video belongs to.
252     chapter_number: Number of the chapter the video belongs to, as an integer.
253     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
254
255     The following fields should only be used when the video is an episode of some
256     series, programme or podcast:
257
258     series:         Title of the series or programme the video episode belongs to.
259     season:         Title of the season the video episode belongs to.
260     season_number:  Number of the season the video episode belongs to, as an integer.
261     season_id:      Id of the season the video episode belongs to, as a unicode string.
262     episode:        Title of the video episode. Unlike mandatory video title field,
263                     this field should denote the exact title of the video episode
264                     without any kind of decoration.
265     episode_number: Number of the video episode within a season, as an integer.
266     episode_id:     Id of the video episode, as a unicode string.
267
268     The following fields should only be used when the media is a track or a part of
269     a music album:
270
271     track:          Title of the track.
272     track_number:   Number of the track within an album or a disc, as an integer.
273     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
274                     as a unicode string.
275     artist:         Artist(s) of the track.
276     genre:          Genre(s) of the track.
277     album:          Title of the album the track belongs to.
278     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
279     album_artist:   List of all artists appeared on the album (e.g.
280                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281                     and compilations).
282     disc_number:    Number of the disc or other physical medium the track belongs to,
283                     as an integer.
284     release_year:   Year (YYYY) when the album was released.
285
286     Unless mentioned otherwise, the fields should be Unicode strings.
287
288     Unless mentioned otherwise, None is equivalent to absence of information.
289
290
291     _type "playlist" indicates multiple videos.
292     There must be a key "entries", which is a list, an iterable, or a PagedList
293     object, each element of which is a valid dictionary by this specification.
294
295     Additionally, playlists can have "title", "description" and "id" attributes
296     with the same semantics as videos (see above).
297
298
299     _type "multi_video" indicates that there are multiple videos that
300     form a single show, for example multiple acts of an opera or TV episode.
301     It must have an entries key like a playlist and contain all the keys
302     required for a video at the same time.
303
304
305     _type "url" indicates that the video must be extracted from another
306     location, possibly by a different extractor. Its only required key is:
307     "url" - the next URL to extract.
308     The key "ie_key" can be set to the class name (minus the trailing "IE",
309     e.g. "Youtube") if the extractor class is known in advance.
310     Additionally, the dictionary may have any properties of the resolved entity
311     known in advance, for example "title" if the title of the referred video is
312     known ahead of time.
313
314
315     _type "url_transparent" entities have the same specification as "url", but
316     indicate that the given additional information is more precise than the one
317     associated with the resolved URL.
318     This is useful when a site employs a video service that hosts the video and
319     its technical metadata, but that video service does not embed a useful
320     title, description etc.
321
322
323     Subclasses of this one should re-define the _real_initialize() and
324     _real_extract() methods and define a _VALID_URL regexp.
325     Probably, they should also be added to the list of extractors.
326
327     _GEO_BYPASS attribute may be set to False in order to disable
328     geo restriction bypass mechanisms for a particular extractor.
329     Though it won't disable explicit geo restriction bypass based on
330     country code provided with geo_bypass_country. (experimental)
331
332     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
333     countries for this extractor. One of these countries will be used by
334     geo restriction bypass mechanism right away in order to bypass
335     geo restriction, of course, if the mechanism is not disabled. (experimental)
336
337     NB: both these geo attributes are experimental and may change in future
338     or be completely removed.
339
340     Finally, the _WORKING attribute should be set to False for broken IEs
341     in order to warn the users and skip the tests.
342     """
343
    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # The YoutubeDL instance driving this extractor (set via set_downloader()).
    _downloader = None
    # Fake source IP sent as X-Forwarded-For for geo bypass, or None.
    _x_forwarded_for_ip = None
    # Set to False in a subclass to disable geo restriction bypass mechanisms.
    _GEO_BYPASS = True
    # Optional list of presumably geo-unrestricted country codes (experimental).
    _GEO_COUNTRIES = None
    # Set to False for broken extractors to warn users and skip the tests.
    _WORKING = True
350
351     def __init__(self, downloader=None):
352         """Constructor. Receives an optional downloader."""
353         self._ready = False
354         self._x_forwarded_for_ip = None
355         self.set_downloader(downloader)
356
357     @classmethod
358     def suitable(cls, url):
359         """Receives a URL and returns True if suitable for this IE."""
360
361         # This does not use has/getattr intentionally - we want to know whether
362         # we have cached the regexp for *this* class, whereas getattr would also
363         # match the superclass
364         if '_VALID_URL_RE' not in cls.__dict__:
365             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
366         return cls._VALID_URL_RE.match(url) is not None
367
368     @classmethod
369     def _match_id(cls, url):
370         if '_VALID_URL_RE' not in cls.__dict__:
371             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
372         m = cls._VALID_URL_RE.match(url)
373         assert m
374         return m.group('id')
375
376     @classmethod
377     def working(cls):
378         """Getter method for _WORKING."""
379         return cls._WORKING
380
381     def initialize(self):
382         """Initializes an instance (authentication, etc)."""
383         self._initialize_geo_bypass(self._GEO_COUNTRIES)
384         if not self._ready:
385             self._real_initialize()
386             self._ready = True
387
    def _initialize_geo_bypass(self, countries):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES.

        You may also manually call it from extractor's code if geo countries
        information is not available beforehand (e.g. obtained during
        extraction) or due to some another reason.
        """
        # Only set up once per instance; a fake IP may already be in place.
        if not self._x_forwarded_for_ip:
            # An explicit --geo-bypass-country always wins.
            country_code = self._downloader.params.get('geo_bypass_country', None)
            # If there is no explicit country for geo bypass specified and
            # the extractor is known to be geo restricted let's fake IP
            # as X-Forwarded-For right away.
            if (not country_code and
                    self._GEO_BYPASS and
                    self._downloader.params.get('geo_bypass', True) and
                    countries):
                country_code = random.choice(countries)
            if country_code:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_stdout(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country_code.upper()))
421
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most two attempts: the second one happens only after a
            # GeoRestrictedError for which a fake X-Forwarded-For IP could
            # be set up (see __maybe_fake_ip_and_retry).
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        # Propagate the fake IP so subsequent requests
                        # (e.g. the actual download) can reuse it.
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            # Already a well-formed extractor error; re-raise unchanged.
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            # Common symptoms of a site layout change; wrap as extractor error.
            raise ExtractorError('An extractor error has occurred.', cause=e)
442
    def __maybe_fake_ip_and_retry(self, countries):
        # Return True if a fake X-Forwarded-For IP was just set up and the
        # extraction should be retried; False otherwise. Only kicks in when
        # no explicit bypass country was given, bypass is enabled, and no
        # fake IP is in place yet.
        if (not self._downloader.params.get('geo_bypass_country', None) and
                self._GEO_BYPASS and
                self._downloader.params.get('geo_bypass', True) and
                not self._x_forwarded_for_ip and
                countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False
457
458     def set_downloader(self, downloader):
459         """Sets the downloader for this IE."""
460         self._downloader = downloader
461
462     def _real_initialize(self):
463         """Real initialization process. Redefine in subclasses."""
464         pass
465
466     def _real_extract(self, url):
467         """Real extraction process. Redefine in subclasses."""
468         pass
469
470     @classmethod
471     def ie_key(cls):
472         """A string for getting the InfoExtractor with get_info_extractor"""
473         return compat_str(cls.__name__[:-2])
474
475     @property
476     def IE_NAME(self):
477         return compat_str(type(self).__name__[:-2])
478
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """ Returns the response handle.

        note=None prints the default progress message, note=False suppresses
        it. On network errors: raises ExtractorError when fatal is true,
        otherwise warns (unless errnote is False) and returns False.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        if isinstance(url_or_request, compat_urllib_request.Request):
            # Merge the extra data/headers/query into the existing Request.
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote=False means the caller handles failures entirely itself.
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
510
511     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
512         """ Returns a tuple (page content as string, URL handle) """
513         # Strip hashes from the URL (#1038)
514         if isinstance(url_or_request, (compat_str, str)):
515             url_or_request = url_or_request.partition('#')[0]
516
517         # Some sites check X-Forwarded-For HTTP header in order to figure out
518         # the origin of the client behind proxy. This allows bypassing geo
519         # restriction by faking this header's value to IP that belongs to some
520         # geo unrestricted country. We will do so once we encounter any
521         # geo restriction error.
522         if self._x_forwarded_for_ip:
523             if 'X-Forwarded-For' not in headers:
524                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
525
526         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
527         if urlh is False:
528             assert not fatal
529             return False
530         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
531         return (content, urlh)
532
533     @staticmethod
534     def _guess_encoding_from_content(content_type, webpage_bytes):
535         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
536         if m:
537             encoding = m.group(1)
538         else:
539             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
540                           webpage_bytes[:1024])
541             if m:
542                 encoding = m.group(1).decode('ascii')
543             elif webpage_bytes.startswith(b'\xff\xfe'):
544                 encoding = 'utf-16'
545             else:
546                 encoding = 'utf-8'
547
548         return encoding
549
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of *urlh* and decode it to text.

        *prefix* bytes, if given, are prepended before decoding; *encoding*
        overrides charset auto-detection. Honors the downloader options
        'dump_intermediate_pages' and 'write_pages', and raises
        ExtractorError for known blocking/censorship pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a Request object or a plain string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # base64 keeps the dump printable regardless of page encoding.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep filenames short enough for common filesystem limits
                # while staying unique via an md5 of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Detected codec name unknown to Python; fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
612
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
        """ Returns the data of the page as a string.

        Retries on IncompleteRead up to *tries* times, sleeping *timeout*
        seconds between attempts; returns False on non-fatal failure.
        """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        # res is False when the download failed non-fatally.
        if res is False:
            return res
        else:
            content, _ = res
            return content
631
632     def _download_xml(self, url_or_request, video_id,
633                       note='Downloading XML', errnote='Unable to download XML',
634                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
635         """Return the xml as an xml.etree.ElementTree.Element"""
636         xml_string = self._download_webpage(
637             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
638         if xml_string is False:
639             return xml_string
640         if transform_source:
641             xml_string = transform_source(xml_string)
642         return compat_etree_fromstring(xml_string.encode('utf-8'))
643
644     def _download_json(self, url_or_request, video_id,
645                        note='Downloading JSON metadata',
646                        errnote='Unable to download JSON metadata',
647                        transform_source=None,
648                        fatal=True, encoding=None, data=None, headers={}, query={}):
649         json_string = self._download_webpage(
650             url_or_request, video_id, note, errnote, fatal=fatal,
651             encoding=encoding, data=data, headers=headers, query=query)
652         if (not fatal) and json_string is False:
653             return None
654         return self._parse_json(
655             json_string, video_id, transform_source=transform_source, fatal=fatal)
656
657     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
658         if transform_source:
659             json_string = transform_source(json_string)
660         try:
661             return json.loads(json_string)
662         except ValueError as ve:
663             errmsg = '%s: Failed to parse JSON ' % video_id
664             if fatal:
665                 raise ExtractorError(errmsg, cause=ve)
666             else:
667                 self.report_warning(errmsg + str(ve))
668
669     def report_warning(self, msg, video_id=None):
670         idstr = '' if video_id is None else '%s: ' % video_id
671         self._downloader.report_warning(
672             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
673
674     def to_screen(self, msg):
675         """Print msg to screen, prefixing it with '[ie_name]'"""
676         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
677
678     def report_extraction(self, id_or_name):
679         """Report information extraction."""
680         self.to_screen('%s: Extracting information' % id_or_name)
681
682     def report_download_webpage(self, video_id):
683         """Report webpage download."""
684         self.to_screen('%s: Downloading webpage' % video_id)
685
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        # Purely informational; the confirmation itself is done by the caller.
        self.to_screen('Confirming age')
689
    def report_login(self):
        """Report attempt to log in."""
        # Purely informational; the login itself is done by the caller.
        self.to_screen('Logging in')
693
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        """Abort extraction, telling the user to supply account credentials.

        expected=True marks this as an anticipated error (no bug-report hint).
        """
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)
699
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """Abort extraction with a GeoRestrictedError, optionally listing the
        countries from which the content is reachable."""
        raise GeoRestrictedError(msg, countries=countries)
703
704     # Methods for following #608
705     @staticmethod
706     def url_result(url, ie=None, video_id=None, video_title=None):
707         """Returns a URL that points to a page that should be processed"""
708         # TODO: ie should be the class used for getting the info
709         video_info = {'_type': 'url',
710                       'url': url,
711                       'ie_key': ie}
712         if video_id is not None:
713             video_info['id'] = video_id
714         if video_title is not None:
715             video_info['title'] = video_title
716         return video_info
717
718     def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
719         urlrs = orderedSet(
720             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
721             for m in matches)
722         return self.playlist_result(
723             urlrs, playlist_id=video_id, playlist_title=video_title)
724
725     @staticmethod
726     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
727         """Returns a playlist"""
728         video_info = {'_type': 'playlist',
729                       'entries': entries}
730         if playlist_id:
731             video_info['id'] = playlist_id
732         if playlist_title:
733             video_info['title'] = playlist_title
734         if playlist_description:
735             video_info['description'] = playlist_description
736         return video_info
737
738     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
739         """
740         Perform a regex search on the given string, using a single or a list of
741         patterns returning the first matching group.
742         In case of failure return a default value or raise a WARNING or a
743         RegexNotFoundError, depending on fatal, specifying the field name.
744         """
745         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
746             mobj = re.search(pattern, string, flags)
747         else:
748             for p in pattern:
749                 mobj = re.search(p, string, flags)
750                 if mobj:
751                     break
752
753         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
754             _name = '\033[0;34m%s\033[0m' % name
755         else:
756             _name = name
757
758         if mobj:
759             if group is None:
760                 # return the first matching group
761                 return next(g for g in mobj.groups() if g is not None)
762             else:
763                 return mobj.group(group)
764         elif default is not NO_DEFAULT:
765             return default
766         elif fatal:
767             raise RegexNotFoundError('Unable to extract %s' % _name)
768         else:
769             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
770             return None
771
772     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
773         """
774         Like _search_regex, but strips HTML tags and unescapes entities.
775         """
776         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
777         if res:
778             return clean_html(res).strip()
779         else:
780             return res
781
782     def _get_netrc_login_info(self, netrc_machine=None):
783         username = None
784         password = None
785         netrc_machine = netrc_machine or self._NETRC_MACHINE
786
787         if self._downloader.params.get('usenetrc', False):
788             try:
789                 info = netrc.netrc().authenticators(netrc_machine)
790                 if info is not None:
791                     username = info[0]
792                     password = info[2]
793                 else:
794                     raise netrc.NetrcParseError(
795                         'No authenticators for %s' % netrc_machine)
796             except (IOError, netrc.NetrcParseError) as err:
797                 self._downloader.report_warning(
798                     'parsing .netrc: %s' % error_to_compat_str(err))
799
800         return username, password
801
802     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
803         """
804         Get the login info as (username, password)
805         First look for the manually specified credentials using username_option
806         and password_option as keys in params dictionary. If no such credentials
807         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
808         value.
809         If there's no info available, return (None, None)
810         """
811         if self._downloader is None:
812             return (None, None)
813
814         downloader_params = self._downloader.params
815
816         # Attempt to use provided username and password or .netrc data
817         if downloader_params.get(username_option) is not None:
818             username = downloader_params[username_option]
819             password = downloader_params[password_option]
820         else:
821             username, password = self._get_netrc_login_info(netrc_machine)
822
823         return username, password
824
825     def _get_tfa_info(self, note='two-factor verification code'):
826         """
827         Get the two-factor authentication info
828         TODO - asking the user will be required for sms/phone verify
829         currently just uses the command line option
830         If there's no info available, return None
831         """
832         if self._downloader is None:
833             return None
834         downloader_params = self._downloader.params
835
836         if downloader_params.get('twofactor') is not None:
837             return downloader_params['twofactor']
838
839         return compat_getpass('Type %s and press [Return]: ' % note)
840
841     # Helper functions for extracting OpenGraph info
842     @staticmethod
843     def _og_regexes(prop):
844         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
845         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
846                        % {'prop': re.escape(prop)})
847         template = r'<meta[^>]+?%s[^>]+?%s'
848         return [
849             template % (property_re, content_re),
850             template % (content_re, property_re),
851         ]
852
    @staticmethod
    def _meta_regex(prop):
        # Regex for a <meta> tag whose itemprop/name/property/id/http-equiv
        # attribute equals `prop` (any quoting style, matched via a
        # backreference); the attribute value is captured in the named
        # group 'content'.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
858
859     def _og_search_property(self, prop, html, name=None, **kargs):
860         if not isinstance(prop, (list, tuple)):
861             prop = [prop]
862         if name is None:
863             name = 'OpenGraph %s' % prop[0]
864         og_regexes = []
865         for p in prop:
866             og_regexes.extend(self._og_regexes(p))
867         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
868         if escaped is None:
869             return None
870         return unescapeHTML(escaped)
871
    def _og_search_thumbnail(self, html, **kargs):
        # Non-fatal: many pages simply lack og:image.
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
874
    def _og_search_description(self, html, **kargs):
        # Non-fatal: the description is optional metadata.
        return self._og_search_property('description', html, fatal=False, **kargs)
877
    def _og_search_title(self, html, **kargs):
        # No fatal=False here, so a missing og:title raises by default
        # (unless the caller overrides fatal/default via kargs).
        return self._og_search_property('title', html, **kargs)
880
881     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
882         regexes = self._og_regexes('video') + self._og_regexes('video:url')
883         if secure:
884             regexes = self._og_regexes('video:secure_url') + regexes
885         return self._html_search_regex(regexes, html, name, **kargs)
886
    def _og_search_url(self, html, **kargs):
        # Return the og:url property value.
        return self._og_search_property('url', html, **kargs)
889
890     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
891         if not isinstance(name, (list, tuple)):
892             name = [name]
893         if display_name is None:
894             display_name = name[0]
895         return self._html_search_regex(
896             [self._meta_regex(n) for n in name],
897             html, display_name, fatal=fatal, group='content', **kwargs)
898
    def _dc_search_uploader(self, html):
        # The Dublin Core 'dc.creator' meta value is reported as the uploader.
        return self._html_search_meta('dc.creator', html, 'uploader')
901
902     def _rta_search(self, html):
903         # See http://www.rtalabel.org/index.php?content=howtofaq#single
904         if re.search(r'(?ix)<meta\s+name="rating"\s+'
905                      r'     content="RTA-5042-1996-1400-1577-RTA"',
906                      html):
907             return 18
908         return 0
909
910     def _media_rating_search(self, html):
911         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
912         rating = self._html_search_meta('rating', html)
913
914         if not rating:
915             return None
916
917         RATING_TABLE = {
918             'safe for kids': 0,
919             'general': 8,
920             '14 years': 14,
921             'mature': 17,
922             'restricted': 19,
923         }
924         return RATING_TABLE.get(rating.lower())
925
926     def _family_friendly_search(self, html):
927         # See http://schema.org/VideoObject
928         family_friendly = self._html_search_meta('isFamilyFriendly', html)
929
930         if not family_friendly:
931             return None
932
933         RATING_TABLE = {
934             '1': 0,
935             'true': 0,
936             '0': 18,
937             'false': 18,
938         }
939         return RATING_TABLE.get(family_friendly.lower())
940
    def _twitter_search_player(self, html):
        # Return the twitter:player meta content (Twitter card player URL).
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
944
945     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
946         json_ld = self._search_regex(
947             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
948             html, 'JSON-LD', group='json_ld', **kwargs)
949         default = kwargs.get('default', NO_DEFAULT)
950         if not json_ld:
951             return default if default is not NO_DEFAULT else {}
952         # JSON-LD may be malformed and thus `fatal` should be respected.
953         # At the same time `default` may be passed that assumes `fatal=False`
954         # for _search_regex. Let's simulate the same behavior here as well.
955         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
956         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
957
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert JSON-LD data (raw string or already parsed) into an info
        dict.

        Only the first entry with @context http://schema.org is examined;
        None-valued fields are stripped from the result.  When expected_type
        is given and the entry's @type differs, an empty dict is returned.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize a single object to a list so both shapes are handled
            # by the loop below.
            json_ld = [json_ld]
        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    info.update({
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
                    })
                # Only the first schema.org entry is considered.
                break
        return dict((k, v) for k, v in info.items() if v is not None)
1006
1007     @staticmethod
1008     def _hidden_inputs(html):
1009         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1010         hidden_inputs = {}
1011         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1012             attrs = extract_attributes(input)
1013             if not input:
1014                 continue
1015             if attrs.get('type') not in ('hidden', 'submit'):
1016                 continue
1017             name = attrs.get('name') or attrs.get('id')
1018             value = attrs.get('value')
1019             if name and value is not None:
1020                 hidden_inputs[name] = value
1021         return hidden_inputs
1022
1023     def _form_hidden_inputs(self, form_id, html):
1024         form = self._search_regex(
1025             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1026             html, '%s form' % form_id, group='form')
1027         return self._hidden_inputs(form)
1028
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        When field_preference (a list/tuple of format-dict keys) is given,
        sorting is driven solely by those fields in order; otherwise a
        built-in heuristic key is used that combines preference, codec
        availability, protocol, extension, resolution and bitrates.
        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified sort: missing values sort lowest
                # ('' for format_id, -1 for everything else).
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Plain HTTP(S) is preferred over RTSP and any other protocol.
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # The tuple is compared lexicographically: earlier entries
            # dominate later ones.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1104
1105     def _check_formats(self, formats, video_id):
1106         if formats:
1107             formats[:] = filter(
1108                 lambda f: self._is_valid_url(
1109                     f['url'], video_id,
1110                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1111                 formats)
1112
1113     @staticmethod
1114     def _remove_duplicate_formats(formats):
1115         format_urls = set()
1116         unique_formats = []
1117         for f in formats:
1118             if f['url'] not in format_urls:
1119                 format_urls.add(f['url'])
1120                 unique_formats.append(f)
1121         formats[:] = unique_formats
1122
1123     def _is_valid_url(self, url, video_id, item='video', headers={}):
1124         url = self._proto_relative_url(url, scheme='http:')
1125         # For now assume non HTTP(S) URLs always valid
1126         if not (url.startswith('http://') or url.startswith('https://')):
1127             return True
1128         try:
1129             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1130             return True
1131         except ExtractorError as e:
1132             if isinstance(e.cause, compat_urllib_error.URLError):
1133                 self.to_screen(
1134                     '%s: %s URL is invalid, skipping' % (video_id, item))
1135                 return False
1136             raise
1137
1138     def http_scheme(self):
1139         """ Either "http:" or "https:", depending on the user's preferences """
1140         return (
1141             'http:'
1142             if self._downloader.params.get('prefer_insecure', False)
1143             else 'https:')
1144
1145     def _proto_relative_url(self, url, scheme=None):
1146         if url is None:
1147             return url
1148         if url.startswith('//'):
1149             if scheme is None:
1150                 scheme = self.http_scheme()
1151             return scheme + url
1152         else:
1153             return url
1154
1155     def _sleep(self, timeout, video_id, msg_template=None):
1156         if msg_template is None:
1157             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1158         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1159         self.to_screen(msg)
1160         time.sleep(timeout)
1161
1162     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1163                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1164                              fatal=True, m3u8_id=None):
1165         manifest = self._download_xml(
1166             manifest_url, video_id, 'Downloading f4m manifest',
1167             'Unable to download f4m manifest',
1168             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1169             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1170             transform_source=transform_source,
1171             fatal=fatal)
1172
1173         if manifest is False:
1174             return []
1175
1176         return self._parse_f4m_formats(
1177             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1178             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1179
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an already-downloaded f4m manifest element into format dicts.

        manifest is an ElementTree element; manifest_url is used to resolve
        relative media URLs and for recursive extraction of nested f4m/m3u8
        manifests.  Returns [] for Akamai player-verification manifests and
        for manifests whose media is entirely DRM-protected.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 <media> nodes exist.
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        # NOTE(review): this local shadows the `base_url` helper imported
        # from ..utils for the rest of this method.
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        # NOTE(review): the field name 'base URL' passed below looks
        # copy-pasted from the baseURL lookup above; it probably should
        # read 'mime type' -- confirm before changing (it only affects
        # diagnostic naming).
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Format id uses the bitrate when known, the node index otherwise.
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1279
1280     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1281         return {
1282             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1283             'url': m3u8_url,
1284             'ext': ext,
1285             'protocol': 'm3u8',
1286             'preference': preference - 100 if preference else -100,
1287             'resolution': 'multiple',
1288             'format_note': 'Quality selection URL',
1289         }
1290
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
        """Download an HLS (m3u8) playlist and return a list of format dicts.

        For a master playlist, one format is returned per variant stream
        plus one per EXT-X-MEDIA audio/video rendition carrying its own URI,
        in addition to a low-preference "meta" entry pointing at the master
        playlist itself. A media playlist is returned as a single format.
        Returns [] on download failure (with fatal=False) or when Adobe
        Flash Access DRM is detected.

        live: if True, the volatile bitrate/name suffix is left out of
        format_id so that live formats keep a stable id over time.
        """

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        # Resolve relative playlist entries against the final
        # (post-redirect) URL
        m3u8_url = urlh.geturl()

        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        # Maps an audio GROUP-ID to True when that group's audio is carried
        # inside the video streams (EXT-X-MEDIA without URI), and to False
        # when standalone audio renditions exist (EXT-X-MEDIA with URI)
        audio_in_video_stream = {}
        # Attributes of the most recent EXT-X-STREAM-INF (resp. URI-less
        # EXT-X-MEDIA) tag; they describe the next plain URI line below
        last_info = {}
        last_media = {}
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                media = parse_m3u8_attributes(line)
                media_type = media.get('TYPE')
                if media_type in ('VIDEO', 'AUDIO'):
                    group_id = media.get('GROUP-ID')
                    media_url = media.get('URI')
                    if media_url:
                        # Rendition with its own URI: emit it as a format
                        format_id = []
                        for v in (group_id, media.get('NAME')):
                            if v:
                                format_id.append(v)
                        f = {
                            'format_id': '-'.join(format_id),
                            'url': format_url(media_url),
                            'language': media.get('LANGUAGE'),
                            'ext': ext,
                            'protocol': entry_protocol,
                            'preference': preference,
                        }
                        if media_type == 'AUDIO':
                            f['vcodec'] = 'none'
                            # do not overwrite an earlier True entry for
                            # this group
                            if group_id and not audio_in_video_stream.get(group_id):
                                audio_in_video_stream[group_id] = False
                        formats.append(f)
                    else:
                        # When there is no URI in EXT-X-MEDIA let this tag's
                        # data be used by regular URI lines below
                        last_media = media
                        if media_type == 'AUDIO' and group_id:
                            audio_in_video_stream[group_id] = True
            elif line.startswith('#') or not line.strip():
                # Skip other tags and empty lines
                continue
            else:
                # Plain URI line: build a format from the preceding tags
                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Despite specification does not mention NAME attribute for
                # EXT-X-STREAM-INF it still sometimes may be present
                stream_name = last_info.get('NAME') or last_media.get('NAME')
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': manifest_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                f.update(parse_codecs(last_info.get('CODECS')))
                # If this variant references an audio group that has its own
                # renditions, the variant itself carries no audio track
                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
                    # TODO: update acodec for audio only formats with the same GROUP-ID
                    f['acodec'] = 'none'
                formats.append(f)
                last_info = {}
                last_media = {}
        return formats
1423
1424     @staticmethod
1425     def _xpath_ns(path, namespace=None):
1426         if not namespace:
1427             return path
1428         out = []
1429         for c in path.split('/'):
1430             if not c or c == '.':
1431                 out.append(c)
1432             else:
1433                 out.append('{%s}%s' % (namespace, c))
1434         return '/'.join(out)
1435
1436     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1437         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1438
1439         if smil is False:
1440             assert not fatal
1441             return []
1442
1443         namespace = self._parse_smil_namespace(smil)
1444
1445         return self._parse_smil_formats(
1446             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1447
1448     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1449         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1450         if smil is False:
1451             return {}
1452         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1453
1454     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1455         return self._download_xml(
1456             smil_url, video_id, 'Downloading SMIL file',
1457             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1458
1459     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1460         namespace = self._parse_smil_namespace(smil)
1461
1462         formats = self._parse_smil_formats(
1463             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1464         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1465
1466         video_id = os.path.splitext(url_basename(smil_url))[0]
1467         title = None
1468         description = None
1469         upload_date = None
1470         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1471             name = meta.attrib.get('name')
1472             content = meta.attrib.get('content')
1473             if not name or not content:
1474                 continue
1475             if not title and name == 'title':
1476                 title = content
1477             elif not description and name in ('description', 'abstract'):
1478                 description = content
1479             elif not upload_date and name == 'date':
1480                 upload_date = unified_strdate(content)
1481
1482         thumbnails = [{
1483             'id': image.get('type'),
1484             'url': image.get('src'),
1485             'width': int_or_none(image.get('width')),
1486             'height': int_or_none(image.get('height')),
1487         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1488
1489         return {
1490             'id': video_id,
1491             'title': title or video_id,
1492             'description': description,
1493             'upload_date': upload_date,
1494             'thumbnails': thumbnails,
1495             'formats': formats,
1496             'subtitles': subtitles,
1497         }
1498
1499     def _parse_smil_namespace(self, smil):
1500         return self._search_regex(
1501             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1502
1503     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1504         base = smil_url
1505         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1506             b = meta.get('base') or meta.get('httpBase')
1507             if b:
1508                 base = b
1509                 break
1510
1511         formats = []
1512         rtmp_count = 0
1513         http_count = 0
1514         m3u8_count = 0
1515
1516         srcs = []
1517         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1518         for medium in media:
1519             src = medium.get('src')
1520             if not src or src in srcs:
1521                 continue
1522             srcs.append(src)
1523
1524             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1525             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1526             width = int_or_none(medium.get('width'))
1527             height = int_or_none(medium.get('height'))
1528             proto = medium.get('proto')
1529             ext = medium.get('ext')
1530             src_ext = determine_ext(src)
1531             streamer = medium.get('streamer') or base
1532
1533             if proto == 'rtmp' or streamer.startswith('rtmp'):
1534                 rtmp_count += 1
1535                 formats.append({
1536                     'url': streamer,
1537                     'play_path': src,
1538                     'ext': 'flv',
1539                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1540                     'tbr': bitrate,
1541                     'filesize': filesize,
1542                     'width': width,
1543                     'height': height,
1544                 })
1545                 if transform_rtmp_url:
1546                     streamer, src = transform_rtmp_url(streamer, src)
1547                     formats[-1].update({
1548                         'url': streamer,
1549                         'play_path': src,
1550                     })
1551                 continue
1552
1553             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1554             src_url = src_url.strip()
1555
1556             if proto == 'm3u8' or src_ext == 'm3u8':
1557                 m3u8_formats = self._extract_m3u8_formats(
1558                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1559                 if len(m3u8_formats) == 1:
1560                     m3u8_count += 1
1561                     m3u8_formats[0].update({
1562                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1563                         'tbr': bitrate,
1564                         'width': width,
1565                         'height': height,
1566                     })
1567                 formats.extend(m3u8_formats)
1568                 continue
1569
1570             if src_ext == 'f4m':
1571                 f4m_url = src_url
1572                 if not f4m_params:
1573                     f4m_params = {
1574                         'hdcore': '3.2.0',
1575                         'plugin': 'flowplayer-3.2.0.1',
1576                     }
1577                 f4m_url += '&' if '?' in f4m_url else '?'
1578                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1579                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1580                 continue
1581
1582             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1583                 http_count += 1
1584                 formats.append({
1585                     'url': src_url,
1586                     'ext': ext or src_ext or 'flv',
1587                     'format_id': 'http-%d' % (bitrate or http_count),
1588                     'tbr': bitrate,
1589                     'filesize': filesize,
1590                     'width': width,
1591                     'height': height,
1592                 })
1593                 continue
1594
1595         return formats
1596
1597     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1598         urls = []
1599         subtitles = {}
1600         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1601             src = textstream.get('src')
1602             if not src or src in urls:
1603                 continue
1604             urls.append(src)
1605             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1606             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1607             subtitles.setdefault(lang, []).append({
1608                 'url': src,
1609                 'ext': ext,
1610             })
1611         return subtitles
1612
1613     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1614         xspf = self._download_xml(
1615             playlist_url, playlist_id, 'Downloading xpsf playlist',
1616             'Unable to download xspf manifest', fatal=fatal)
1617         if xspf is False:
1618             return []
1619         return self._parse_xspf(xspf, playlist_id)
1620
1621     def _parse_xspf(self, playlist, playlist_id):
1622         NS_MAP = {
1623             'xspf': 'http://xspf.org/ns/0/',
1624             's1': 'http://static.streamone.nl/player/ns/0',
1625         }
1626
1627         entries = []
1628         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1629             title = xpath_text(
1630                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1631             description = xpath_text(
1632                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1633             thumbnail = xpath_text(
1634                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1635             duration = float_or_none(
1636                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1637
1638             formats = [{
1639                 'url': location.text,
1640                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1641                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1642                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1643             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1644             self._sort_formats(formats)
1645
1646             entries.append({
1647                 'id': playlist_id,
1648                 'title': title,
1649                 'description': description,
1650                 'thumbnail': thumbnail,
1651                 'duration': duration,
1652                 'formats': formats,
1653             })
1654         return entries
1655
1656     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1657         res = self._download_webpage_handle(
1658             mpd_url, video_id,
1659             note=note or 'Downloading MPD manifest',
1660             errnote=errnote or 'Failed to download MPD manifest',
1661             fatal=fatal)
1662         if res is False:
1663             return []
1664         mpd, urlh = res
1665         mpd_base_url = base_url(urlh.geturl())
1666
1667         return self._parse_mpd_formats(
1668             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1669             formats_dict=formats_dict, mpd_url=mpd_url)
1670
1671     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1672         """
1673         Parse formats from MPD manifest.
1674         References:
1675          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1676             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1677          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1678         """
1679         if mpd_doc.get('type') == 'dynamic':
1680             return []
1681
1682         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1683
1684         def _add_ns(path):
1685             return self._xpath_ns(path, namespace)
1686
1687         def is_drm_protected(element):
1688             return element.find(_add_ns('ContentProtection')) is not None
1689
1690         def extract_multisegment_info(element, ms_parent_info):
1691             ms_info = ms_parent_info.copy()
1692
1693             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1694             # common attributes and elements.  We will only extract relevant
1695             # for us.
1696             def extract_common(source):
1697                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1698                 if segment_timeline is not None:
1699                     s_e = segment_timeline.findall(_add_ns('S'))
1700                     if s_e:
1701                         ms_info['total_number'] = 0
1702                         ms_info['s'] = []
1703                         for s in s_e:
1704                             r = int(s.get('r', 0))
1705                             ms_info['total_number'] += 1 + r
1706                             ms_info['s'].append({
1707                                 't': int(s.get('t', 0)),
1708                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1709                                 'd': int(s.attrib['d']),
1710                                 'r': r,
1711                             })
1712                 start_number = source.get('startNumber')
1713                 if start_number:
1714                     ms_info['start_number'] = int(start_number)
1715                 timescale = source.get('timescale')
1716                 if timescale:
1717                     ms_info['timescale'] = int(timescale)
1718                 segment_duration = source.get('duration')
1719                 if segment_duration:
1720                     ms_info['segment_duration'] = int(segment_duration)
1721
1722             def extract_Initialization(source):
1723                 initialization = source.find(_add_ns('Initialization'))
1724                 if initialization is not None:
1725                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1726
1727             segment_list = element.find(_add_ns('SegmentList'))
1728             if segment_list is not None:
1729                 extract_common(segment_list)
1730                 extract_Initialization(segment_list)
1731                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1732                 if segment_urls_e:
1733                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1734             else:
1735                 segment_template = element.find(_add_ns('SegmentTemplate'))
1736                 if segment_template is not None:
1737                     extract_common(segment_template)
1738                     media = segment_template.get('media')
1739                     if media:
1740                         ms_info['media'] = media
1741                     initialization = segment_template.get('initialization')
1742                     if initialization:
1743                         ms_info['initialization'] = initialization
1744                     else:
1745                         extract_Initialization(segment_template)
1746             return ms_info
1747
1748         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1749         formats = []
1750         for period in mpd_doc.findall(_add_ns('Period')):
1751             period_duration = parse_duration(period.get('duration')) or mpd_duration
1752             period_ms_info = extract_multisegment_info(period, {
1753                 'start_number': 1,
1754                 'timescale': 1,
1755             })
1756             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1757                 if is_drm_protected(adaptation_set):
1758                     continue
1759                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1760                 for representation in adaptation_set.findall(_add_ns('Representation')):
1761                     if is_drm_protected(representation):
1762                         continue
1763                     representation_attrib = adaptation_set.attrib.copy()
1764                     representation_attrib.update(representation.attrib)
1765                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1766                     mime_type = representation_attrib['mimeType']
1767                     content_type = mime_type.split('/')[0]
1768                     if content_type == 'text':
1769                         # TODO implement WebVTT downloading
1770                         pass
1771                     elif content_type == 'video' or content_type == 'audio':
1772                         base_url = ''
1773                         for element in (representation, adaptation_set, period, mpd_doc):
1774                             base_url_e = element.find(_add_ns('BaseURL'))
1775                             if base_url_e is not None:
1776                                 base_url = base_url_e.text + base_url
1777                                 if re.match(r'^https?://', base_url):
1778                                     break
1779                         if mpd_base_url and not re.match(r'^https?://', base_url):
1780                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1781                                 mpd_base_url += '/'
1782                             base_url = mpd_base_url + base_url
1783                         representation_id = representation_attrib.get('id')
1784                         lang = representation_attrib.get('lang')
1785                         url_el = representation.find(_add_ns('BaseURL'))
1786                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1787                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1788                         f = {
1789                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1790                             'url': base_url,
1791                             'manifest_url': mpd_url,
1792                             'ext': mimetype2ext(mime_type),
1793                             'width': int_or_none(representation_attrib.get('width')),
1794                             'height': int_or_none(representation_attrib.get('height')),
1795                             'tbr': int_or_none(bandwidth, 1000),
1796                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1797                             'fps': int_or_none(representation_attrib.get('frameRate')),
1798                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1799                             'format_note': 'DASH %s' % content_type,
1800                             'filesize': filesize,
1801                         }
1802                         f.update(parse_codecs(representation_attrib.get('codecs')))
1803                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1804
1805                         def prepare_template(template_name, identifiers):
1806                             t = representation_ms_info[template_name]
1807                             t = t.replace('$RepresentationID$', representation_id)
1808                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1809                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1810                             t.replace('$$', '$')
1811                             return t
1812
1813                         # @initialization is a regular template like @media one
1814                         # so it should be handled just the same way (see
1815                         # https://github.com/rg3/youtube-dl/issues/11605)
1816                         if 'initialization' in representation_ms_info:
1817                             initialization_template = prepare_template(
1818                                 'initialization',
1819                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1820                                 # $Time$ shall not be included for @initialization thus
1821                                 # only $Bandwidth$ remains
1822                                 ('Bandwidth', ))
1823                             representation_ms_info['initialization_url'] = initialization_template % {
1824                                 'Bandwidth': bandwidth,
1825                             }
1826
1827                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1828
1829                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1830
1831                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1832                             # can't be used at the same time
1833                             if '%(Number' in media_template and 's' not in representation_ms_info:
1834                                 segment_duration = None
1835                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1836                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1837                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1838                                 representation_ms_info['fragments'] = [{
1839                                     'url': media_template % {
1840                                         'Number': segment_number,
1841                                         'Bandwidth': bandwidth,
1842                                     },
1843                                     'duration': segment_duration,
1844                                 } for segment_number in range(
1845                                     representation_ms_info['start_number'],
1846                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1847                             else:
1848                                 # $Number*$ or $Time$ in media template with S list available
1849                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1850                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1851                                 representation_ms_info['fragments'] = []
1852                                 segment_time = 0
1853                                 segment_d = None
1854                                 segment_number = representation_ms_info['start_number']
1855
1856                                 def add_segment_url():
1857                                     segment_url = media_template % {
1858                                         'Time': segment_time,
1859                                         'Bandwidth': bandwidth,
1860                                         'Number': segment_number,
1861                                     }
1862                                     representation_ms_info['fragments'].append({
1863                                         'url': segment_url,
1864                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1865                                     })
1866
1867                                 for num, s in enumerate(representation_ms_info['s']):
1868                                     segment_time = s.get('t') or segment_time
1869                                     segment_d = s['d']
1870                                     add_segment_url()
1871                                     segment_number += 1
1872                                     for r in range(s.get('r', 0)):
1873                                         segment_time += segment_d
1874                                         add_segment_url()
1875                                         segment_number += 1
1876                                     segment_time += segment_d
1877                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1878                             # No media template
1879                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1880                             # or any YouTube dashsegments video
1881                             fragments = []
1882                             segment_index = 0
1883                             timescale = representation_ms_info['timescale']
1884                             for s in representation_ms_info['s']:
1885                                 duration = float_or_none(s['d'], timescale)
1886                                 for r in range(s.get('r', 0) + 1):
1887                                     fragments.append({
1888                                         'url': representation_ms_info['segment_urls'][segment_index],
1889                                         'duration': duration,
1890                                     })
1891                                     segment_index += 1
1892                             representation_ms_info['fragments'] = fragments
1893                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1894                         # No fragments key is present in this case.
1895                         if 'fragments' in representation_ms_info:
1896                             f.update({
1897                                 'fragments': [],
1898                                 'protocol': 'http_dash_segments',
1899                             })
1900                             if 'initialization_url' in representation_ms_info:
1901                                 initialization_url = representation_ms_info['initialization_url']
1902                                 if not f.get('url'):
1903                                     f['url'] = initialization_url
1904                                 f['fragments'].append({'url': initialization_url})
1905                             f['fragments'].extend(representation_ms_info['fragments'])
1906                             for fragment in f['fragments']:
1907                                 fragment['url'] = urljoin(base_url, fragment['url'])
1908                         try:
1909                             existing_format = next(
1910                                 fo for fo in formats
1911                                 if fo['format_id'] == representation_id)
1912                         except StopIteration:
1913                             full_info = formats_dict.get(representation_id, {}).copy()
1914                             full_info.update(f)
1915                             formats.append(full_info)
1916                         else:
1917                             existing_format.update(f)
1918                     else:
1919                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1920         return formats
1921
1922     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1923         res = self._download_webpage_handle(
1924             ism_url, video_id,
1925             note=note or 'Downloading ISM manifest',
1926             errnote=errnote or 'Failed to download ISM manifest',
1927             fatal=fatal)
1928         if res is False:
1929             return []
1930         ism, urlh = res
1931
1932         return self._parse_ism_formats(
1933             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1934
1935     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1936         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1937             return []
1938
1939         duration = int(ism_doc.attrib['Duration'])
1940         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1941
1942         formats = []
1943         for stream in ism_doc.findall('StreamIndex'):
1944             stream_type = stream.get('Type')
1945             if stream_type not in ('video', 'audio'):
1946                 continue
1947             url_pattern = stream.attrib['Url']
1948             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1949             stream_name = stream.get('Name')
1950             for track in stream.findall('QualityLevel'):
1951                 fourcc = track.get('FourCC')
1952                 # TODO: add support for WVC1 and WMAP
1953                 if fourcc not in ('H264', 'AVC1', 'AACL'):
1954                     self.report_warning('%s is not a supported codec' % fourcc)
1955                     continue
1956                 tbr = int(track.attrib['Bitrate']) // 1000
1957                 width = int_or_none(track.get('MaxWidth'))
1958                 height = int_or_none(track.get('MaxHeight'))
1959                 sampling_rate = int_or_none(track.get('SamplingRate'))
1960
1961                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1962                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1963
1964                 fragments = []
1965                 fragment_ctx = {
1966                     'time': 0,
1967                 }
1968                 stream_fragments = stream.findall('c')
1969                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1970                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1971                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1972                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1973                     if not fragment_ctx['duration']:
1974                         try:
1975                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1976                         except IndexError:
1977                             next_fragment_time = duration
1978                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1979                     for _ in range(fragment_repeat):
1980                         fragments.append({
1981                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1982                             'duration': fragment_ctx['duration'] / stream_timescale,
1983                         })
1984                         fragment_ctx['time'] += fragment_ctx['duration']
1985
1986                 format_id = []
1987                 if ism_id:
1988                     format_id.append(ism_id)
1989                 if stream_name:
1990                     format_id.append(stream_name)
1991                 format_id.append(compat_str(tbr))
1992
1993                 formats.append({
1994                     'format_id': '-'.join(format_id),
1995                     'url': ism_url,
1996                     'manifest_url': ism_url,
1997                     'ext': 'ismv' if stream_type == 'video' else 'isma',
1998                     'width': width,
1999                     'height': height,
2000                     'tbr': tbr,
2001                     'asr': sampling_rate,
2002                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2003                     'acodec': 'none' if stream_type == 'video' else fourcc,
2004                     'protocol': 'ism',
2005                     'fragments': fragments,
2006                     '_download_params': {
2007                         'duration': duration,
2008                         'timescale': stream_timescale,
2009                         'width': width or 0,
2010                         'height': height or 0,
2011                         'fourcc': fourcc,
2012                         'codec_private_data': track.get('CodecPrivateData'),
2013                         'sampling_rate': sampling_rate,
2014                         'channels': int_or_none(track.get('Channels', 2)),
2015                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2016                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2017                     },
2018                 })
2019         return formats
2020
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage.

        Relative src/track URLs are resolved against base_url; m3u8/mpd
        sources are expanded through the corresponding manifest extractors.
        Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'
        keys, one per tag that produced formats or subtitles.
        """
        def absolute_url(video_url):
            # Resolve a possibly relative URL against the page URL
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Derive codec/ext hints from a MIME type such as
            # 'video/mp4; codecs="avc1.42E01E"'
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type):
            # Expand a single src attribute into (is_plain_url, formats):
            # manifest URLs (m3u8/mpd) yield several formats, anything else one
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # Self-closing tags carry no inner content, hence the empty string
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            # NOTE(review): poster is not passed through absolute_url, so a
            # relative poster URL stays relative — confirm whether intended
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        # Single plain format: enrich it with codec/ext hints
                        # taken from the <source type="..."> attribute
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Only subtitle/caption tracks (or kind-less ones) are kept
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2105
2106     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2107         formats = []
2108         hdcore_sign = 'hdcore=3.7.0'
2109         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2110         hds_host = hosts.get('hds')
2111         if hds_host:
2112             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2113         if 'hdcore=' not in f4m_url:
2114             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2115         f4m_formats = self._extract_f4m_formats(
2116             f4m_url, video_id, f4m_id='hds', fatal=False)
2117         for entry in f4m_formats:
2118             entry.update({'extra_param_to_segment_url': hdcore_sign})
2119         formats.extend(f4m_formats)
2120         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2121         hls_host = hosts.get('hls')
2122         if hls_host:
2123             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2124         formats.extend(self._extract_m3u8_formats(
2125             m3u8_url, video_id, 'mp4', 'm3u8_native',
2126             m3u8_id='hls', fatal=False))
2127         return formats
2128
2129     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2130         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2131         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2132         http_base_url = 'http' + url_base
2133         formats = []
2134         if 'm3u8' not in skip_protocols:
2135             formats.extend(self._extract_m3u8_formats(
2136                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2137                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2138         if 'f4m' not in skip_protocols:
2139             formats.extend(self._extract_f4m_formats(
2140                 http_base_url + '/manifest.f4m',
2141                 video_id, f4m_id='hds', fatal=False))
2142         if 'dash' not in skip_protocols:
2143             formats.extend(self._extract_mpd_formats(
2144                 http_base_url + '/manifest.mpd',
2145                 video_id, mpd_id='dash', fatal=False))
2146         if re.search(r'(?:/smil:|\.smil)', url_base):
2147             if 'smil' not in skip_protocols:
2148                 rtmp_formats = self._extract_smil_formats(
2149                     http_base_url + '/jwplayer.smil',
2150                     video_id, fatal=False)
2151                 for rtmp_format in rtmp_formats:
2152                     rtsp_format = rtmp_format.copy()
2153                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2154                     del rtsp_format['play_path']
2155                     del rtsp_format['ext']
2156                     rtsp_format.update({
2157                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2158                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2159                         'protocol': 'rtsp',
2160                     })
2161                     formats.extend([rtmp_format, rtsp_format])
2162         else:
2163             for protocol in ('rtmp', 'rtsp'):
2164                 if protocol not in skip_protocols:
2165                     formats.append({
2166                         'url': protocol + url_base,
2167                         'format_id': protocol,
2168                         'protocol': protocol,
2169                     })
2170         return formats
2171
2172     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2173         mobj = re.search(
2174             r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2175             webpage)
2176         if mobj:
2177             try:
2178                 jwplayer_data = self._parse_json(mobj.group('options'),
2179                                                  video_id=video_id,
2180                                                  transform_source=transform_source)
2181             except ExtractorError:
2182                 pass
2183             else:
2184                 if isinstance(jwplayer_data, dict):
2185                     return jwplayer_data
2186
2187     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2188         jwplayer_data = self._find_jwplayer_data(
2189             webpage, video_id, transform_source=js_to_json)
2190         return self._parse_jwplayer_data(
2191             jwplayer_data, video_id, *args, **kwargs)
2192
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer setup/config dict into an info dict or playlist.

        Normalizes the several historical JWPlayer config shapes into the
        playlist-of-items form, then builds one entry per playlist item.
        Returns the single entry directly when there is exactly one item,
        otherwise a playlist result.  Note: mutates jwplayer_data in place
        while normalizing.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            # Raises KeyError when no video_id is given and 'mediaid' is absent
            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            self._sort_formats(formats)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    # Only caption tracks become subtitles; chapters etc. are skipped
                    if track.get('kind') != 'captions':
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entries.append({
                'id': this_video_id,
                # With require_title a missing title raises KeyError
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'formats': formats,
            })
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2247
2248     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2249                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2250         formats = []
2251         for source in jwplayer_sources_data:
2252             source_url = self._proto_relative_url(source['file'])
2253             if base_url:
2254                 source_url = compat_urlparse.urljoin(base_url, source_url)
2255             source_type = source.get('type') or ''
2256             ext = mimetype2ext(source_type) or determine_ext(source_url)
2257             if source_type == 'hls' or ext == 'm3u8':
2258                 formats.extend(self._extract_m3u8_formats(
2259                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2260                     m3u8_id=m3u8_id, fatal=False))
2261             elif ext == 'mpd':
2262                 formats.extend(self._extract_mpd_formats(
2263                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2264             elif ext == 'smil':
2265                 formats.extend(self._extract_smil_formats(
2266                     source_url, video_id, fatal=False))
2267             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2268             elif source_type.startswith('audio') or ext in (
2269                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2270                 formats.append({
2271                     'url': source_url,
2272                     'vcodec': 'none',
2273                     'ext': ext,
2274                 })
2275             else:
2276                 height = int_or_none(source.get('height'))
2277                 if height is None:
2278                     # Often no height is provided but there is a label in
2279                     # format like "1080p", "720p SD", or 1080.
2280                     height = int_or_none(self._search_regex(
2281                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2282                         'height', default=None))
2283                 a_format = {
2284                     'url': source_url,
2285                     'width': int_or_none(source.get('width')),
2286                     'height': height,
2287                     'tbr': int_or_none(source.get('bitrate')),
2288                     'ext': ext,
2289                 }
2290                 if source_url.startswith('rtmp'):
2291                     a_format['ext'] = 'flv'
2292                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2293                     # of jwplayer.flash.swf
2294                     rtmp_url_parts = re.split(
2295                         r'((?:mp4|mp3|flv):)', source_url, 1)
2296                     if len(rtmp_url_parts) == 3:
2297                         rtmp_url, prefix, play_path = rtmp_url_parts
2298                         a_format.update({
2299                             'url': rtmp_url,
2300                             'play_path': prefix + play_path,
2301                         })
2302                     if rtmp_params:
2303                         a_format.update(rtmp_params)
2304                 formats.append(a_format)
2305         return formats
2306
2307     def _live_title(self, name):
2308         """ Generate the title for a live video """
2309         now = datetime.datetime.now()
2310         now_str = now.strftime('%Y-%m-%d %H:%M')
2311         return name + ' ' + now_str
2312
2313     def _int(self, v, name, fatal=False, **kwargs):
2314         res = int_or_none(v, **kwargs)
2315         if 'get_attr' in kwargs:
2316             print(getattr(v, kwargs['get_attr']))
2317         if res is None:
2318             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2319             if fatal:
2320                 raise ExtractorError(msg)
2321             else:
2322                 self._downloader.report_warning(msg)
2323         return res
2324
2325     def _float(self, v, name, fatal=False, **kwargs):
2326         res = float_or_none(v, **kwargs)
2327         if res is None:
2328             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2329             if fatal:
2330                 raise ExtractorError(msg)
2331             else:
2332                 self._downloader.report_warning(msg)
2333         return res
2334
2335     def _set_cookie(self, domain, name, value, expire_time=None):
2336         cookie = compat_cookiejar.Cookie(
2337             0, name, value, None, None, domain, None,
2338             None, '/', True, False, expire_time, '', None, None, None)
2339         self._downloader.cookiejar.set_cookie(cookie)
2340
2341     def _get_cookies(self, url):
2342         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2343         req = sanitized_Request(url)
2344         self._downloader.cookiejar.add_cookie_header(req)
2345         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2346
2347     def get_testcases(self, include_onlymatching=False):
2348         t = getattr(self, '_TEST', None)
2349         if t:
2350             assert not hasattr(self, '_TESTS'), \
2351                 '%s has _TEST and _TESTS' % type(self).__name__
2352             tests = [t]
2353         else:
2354             tests = getattr(self, '_TESTS', [])
2355         for t in tests:
2356             if not include_onlymatching and t.get('only_matching', False):
2357                 continue
2358             t['name'] = type(self).__name__[:-len('IE')]
2359             yield t
2360
2361     def is_suitable(self, age_limit):
2362         """ Test whether the extractor is generally suitable for the given
2363         age limit (i.e. pornographic sites are not, all others usually are) """
2364
2365         any_restricted = False
2366         for tc in self.get_testcases(include_onlymatching=False):
2367             if tc.get('playlist', []):
2368                 tc = tc['playlist'][0]
2369             is_restricted = age_restricted(
2370                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2371             if not is_restricted:
2372                 return True
2373             any_restricted = any_restricted or is_restricted
2374         return not any_restricted
2375
2376     def extract_subtitles(self, *args, **kwargs):
2377         if (self._downloader.params.get('writesubtitles', False) or
2378                 self._downloader.params.get('listsubtitles')):
2379             return self._get_subtitles(*args, **kwargs)
2380         return {}
2381
    def _get_subtitles(self, *args, **kwargs):
        # Subclass hook; called by extract_subtitles() when the user
        # requested subtitles.
        raise NotImplementedError('This method must be implemented by subclasses')
2384
2385     @staticmethod
2386     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2387         """ Merge subtitle items for one language. Items with duplicated URLs
2388         will be dropped. """
2389         list1_urls = set([item['url'] for item in subtitle_list1])
2390         ret = list(subtitle_list1)
2391         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2392         return ret
2393
2394     @classmethod
2395     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2396         """ Merge two subtitle dictionaries, language by language. """
2397         ret = dict(subtitle_dict1)
2398         for lang in subtitle_dict2:
2399             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2400         return ret
2401
2402     def extract_automatic_captions(self, *args, **kwargs):
2403         if (self._downloader.params.get('writeautomaticsub', False) or
2404                 self._downloader.params.get('listsubtitles')):
2405             return self._get_automatic_captions(*args, **kwargs)
2406         return {}
2407
    def _get_automatic_captions(self, *args, **kwargs):
        # Subclass hook; called by extract_automatic_captions() when the user
        # requested automatic captions.
        raise NotImplementedError('This method must be implemented by subclasses')
2410
2411     def mark_watched(self, *args, **kwargs):
2412         if (self._downloader.params.get('mark_watched', False) and
2413                 (self._get_login_info()[0] is not None or
2414                     self._downloader.params.get('cookiefile') is not None)):
2415             self._mark_watched(*args, **kwargs)
2416
    def _mark_watched(self, *args, **kwargs):
        # Subclass hook; called by mark_watched() when watch-marking is
        # enabled and credentials or a cookie file are available.
        raise NotImplementedError('This method must be implemented by subclasses')
2419
2420     def geo_verification_headers(self):
2421         headers = {}
2422         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2423         if geo_verification_proxy:
2424             headers['Ytdl-request-proxy'] = geo_verification_proxy
2425         return headers
2426
2427     def _generic_id(self, url):
2428         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2429
2430     def _generic_title(self, url):
2431         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2432
2433
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (first result), a positive number, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        # A URL is handled by this extractor iff it matches the search scheme
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            # Bare search key: return only the first result
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp the request to the extractor's maximum
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY