[extractor/common] Extract interaction statistic
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_http_client,
23     compat_os_name,
24     compat_str,
25     compat_urllib_error,
26     compat_urllib_parse_unquote,
27     compat_urllib_parse_urlencode,
28     compat_urllib_request,
29     compat_urlparse,
30     compat_xml_parse_error,
31 )
32 from ..downloader.f4m import (
33     get_base_url,
34     remove_encrypted_media,
35 )
36 from ..utils import (
37     NO_DEFAULT,
38     age_restricted,
39     base_url,
40     bug_reports_message,
41     clean_html,
42     compiled_regex_type,
43     determine_ext,
44     determine_protocol,
45     error_to_compat_str,
46     ExtractorError,
47     extract_attributes,
48     fix_xml_ampersands,
49     float_or_none,
50     GeoRestrictedError,
51     GeoUtils,
52     int_or_none,
53     js_to_json,
54     mimetype2ext,
55     orderedSet,
56     parse_codecs,
57     parse_duration,
58     parse_iso8601,
59     parse_m3u8_attributes,
60     RegexNotFoundError,
61     sanitized_Request,
62     sanitize_filename,
63     unescapeHTML,
64     unified_strdate,
65     unified_timestamp,
66     update_Request,
67     update_url_query,
68     urljoin,
69     url_basename,
70     xpath_element,
71     xpath_text,
72     xpath_with_ns,
73 )
74
75
76 class InfoExtractor(object):
77     """Information Extractor class.
78
79     Information extractors are the classes that, given a URL, extract
80     information about the video (or videos) the URL refers to. This
81     information includes the real video URL, the video title, author and
82     others. The information is stored in a dictionary which is then
83     passed to the YoutubeDL. The YoutubeDL processes this
84     information possibly downloading the video to the file system, among
85     other possible outcomes.
86
87     The type field determines the type of the result.
88     By far the most common value (and the default if _type is missing) is
89     "video", which indicates a single video.
90
91     For a video, the dictionaries must include the following fields:
92
93     id:             Video identifier.
94     title:          Video title, unescaped.
95
96     Additionally, it must contain either a formats entry or a url one:
97
98     formats:        A list of dictionaries for each format available, ordered
99                     from worst to best quality.
100
101                     Potential fields:
102                     * url        Mandatory. The URL of the video file
103                     * manifest_url
104                                  The URL of the manifest file in case of
105                                  fragmented media (DASH, hls, hds)
106                     * ext        Will be calculated from URL if missing
107                     * format     A human-readable description of the format
108                                  ("mp4 container with h264/opus").
109                                  Calculated from the format_id, width, height.
110                                  and format_note fields if missing.
111                     * format_id  A short description of the format
112                                  ("mp4_h264_opus" or "19").
113                                 Technically optional, but strongly recommended.
114                     * format_note Additional info about the format
115                                  ("3D" or "DASH video")
116                     * width      Width of the video, if known
117                     * height     Height of the video, if known
118                     * resolution Textual description of width and height
119                     * tbr        Average bitrate of audio and video in KBit/s
120                     * abr        Average audio bitrate in KBit/s
121                     * acodec     Name of the audio codec in use
122                     * asr        Audio sampling rate in Hertz
123                     * vbr        Average video bitrate in KBit/s
124                     * fps        Frame rate
125                     * vcodec     Name of the video codec in use
126                     * container  Name of the container format
127                     * filesize   The number of bytes, if known in advance
128                     * filesize_approx  An estimate for the number of bytes
129                     * player_url SWF Player URL (used for rtmpdump).
130                     * protocol   The protocol that will be used for the actual
131                                  download, lower-case.
132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
133                                  "m3u8", "m3u8_native" or "http_dash_segments".
134                     * fragment_base_url
135                                  Base URL for fragments. Each fragment's path
136                                  value (if present) will be relative to
137                                  this URL.
138                     * fragments  A list of fragments of a fragmented media.
139                                  Each fragment entry must contain either an url
140                                  or a path. If an url is present it should be
141                                  considered by a client. Otherwise both path and
142                                  fragment_base_url must be present. Here is
143                                  the list of all potential fields:
144                                  * "url" - fragment's URL
145                                  * "path" - fragment's path relative to
146                                             fragment_base_url
147                                  * "duration" (optional, int or float)
148                                  * "filesize" (optional, int)
149                     * preference Order number of this format. If this field is
150                                  present and not None, the formats get sorted
151                                  by this field, regardless of all other values.
152                                  -1 for default (order by other properties),
153                                  -2 or smaller for less than default.
154                                  < -1000 to hide the format (if there is
155                                     another one which is strictly better)
156                     * language   Language code, e.g. "de" or "en-US".
157                     * language_preference  Is this in the language mentioned in
158                                  the URL?
159                                  10 if it's what the URL is about,
160                                  -1 for default (don't know),
161                                  -10 otherwise, other values reserved for now.
162                     * quality    Order number of the video quality of this
163                                  format, irrespective of the file format.
164                                  -1 for default (order by other properties),
165                                  -2 or smaller for less than default.
166                     * source_preference  Order number for this video source
167                                   (quality takes higher priority)
168                                  -1 for default (order by other properties),
169                                  -2 or smaller for less than default.
170                     * http_headers  A dictionary of additional HTTP headers
171                                  to add to the request.
172                     * stretched_ratio  If given and not 1, indicates that the
173                                  video's pixels are not square.
174                                  width : height ratio as float.
175                     * no_resume  The server does not support resuming the
176                                  (HTTP or RTMP) download. Boolean.
177                     * downloader_options  A dictionary of downloader options as
178                                  described in FileDownloader
179
180     url:            Final video URL.
181     ext:            Video filename extension.
182     format:         The video format, defaults to ext (used for --get-format)
183     player_url:     SWF Player URL (used for rtmpdump).
184
185     The following fields are optional:
186
187     alt_title:      A secondary title of the video.
188     display_id      An alternative identifier for the video, not necessarily
189                     unique, but available before title. Typically, id is
190                     something like "4234987", title "Dancing naked mole rats",
191                     and display_id "dancing-naked-mole-rats"
192     thumbnails:     A list of dictionaries, with the following entries:
193                         * "id" (optional, string) - Thumbnail format ID
194                         * "url"
195                         * "preference" (optional, int) - quality of the image
196                         * "width" (optional, int)
197                         * "height" (optional, int)
198                         * "resolution" (optional, string "{width}x{height}",
199                                         deprecated)
200                         * "filesize" (optional, int)
201     thumbnail:      Full URL to a video thumbnail image.
202     description:    Full video description.
203     uploader:       Full name of the video uploader.
204     license:        License name the video is licensed under.
205     creator:        The creator of the video.
206     release_date:   The date (YYYYMMDD) when the video was released.
207     timestamp:      UNIX timestamp of the moment the video became available.
208     upload_date:    Video upload date (YYYYMMDD).
209                     If not explicitly set, calculated from timestamp.
210     uploader_id:    Nickname or id of the video uploader.
211     uploader_url:   Full URL to a personal webpage of the video uploader.
212     location:       Physical location where the video was filmed.
213     subtitles:      The available subtitles as a dictionary in the format
214                     {tag: subformats}. "tag" is usually a language code, and
215                     "subformats" is a list sorted from lower to higher
216                     preference, each element is a dictionary with the "ext"
217                     entry and one of:
218                         * "data": The subtitles file contents
219                         * "url": A URL pointing to the subtitles file
220                     "ext" will be calculated from URL if missing
221     automatic_captions: Like 'subtitles', used by the YoutubeIE for
222                     automatically generated captions
223     duration:       Length of the video in seconds, as an integer or float.
224     view_count:     How many users have watched the video on the platform.
225     like_count:     Number of positive ratings of the video
226     dislike_count:  Number of negative ratings of the video
227     repost_count:   Number of reposts of the video
228     average_rating: Average rating given by users, the scale used depends on the webpage
229     comment_count:  Number of comments on the video
230     comments:       A list of comments, each with one or more of the following
231                     properties (all but one of text or html optional):
232                         * "author" - human-readable name of the comment author
233                         * "author_id" - user ID of the comment author
234                         * "id" - Comment ID
235                         * "html" - Comment as HTML
236                         * "text" - Plain text of the comment
237                         * "timestamp" - UNIX timestamp of comment
238                         * "parent" - ID of the comment this one is replying to.
239                                      Set to "root" to indicate that this is a
240                                      comment to the original video.
241     age_limit:      Age restriction for the video, as an integer (years)
242     webpage_url:    The URL to the video webpage, if given to youtube-dl it
243                     should allow to get the same result again. (It will be set
244                     by YoutubeDL if it's missing)
245     categories:     A list of categories that the video falls in, for example
246                     ["Sports", "Berlin"]
247     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
248     is_live:        True, False, or None (=unknown). Whether this video is a
249                     live stream that goes on instead of a fixed-length video.
250     start_time:     Time in seconds where the reproduction should start, as
251                     specified in the URL.
252     end_time:       Time in seconds where the reproduction should end, as
253                     specified in the URL.
254     chapters:       A list of dictionaries, with the following entries:
255                         * "start_time" - The start time of the chapter in seconds
256                         * "end_time" - The end time of the chapter in seconds
257                         * "title" (optional, string)
258
259     The following fields should only be used when the video belongs to some logical
260     chapter or section:
261
262     chapter:        Name or title of the chapter the video belongs to.
263     chapter_number: Number of the chapter the video belongs to, as an integer.
264     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
265
266     The following fields should only be used when the video is an episode of some
267     series, programme or podcast:
268
269     series:         Title of the series or programme the video episode belongs to.
270     season:         Title of the season the video episode belongs to.
271     season_number:  Number of the season the video episode belongs to, as an integer.
272     season_id:      Id of the season the video episode belongs to, as a unicode string.
273     episode:        Title of the video episode. Unlike mandatory video title field,
274                     this field should denote the exact title of the video episode
275                     without any kind of decoration.
276     episode_number: Number of the video episode within a season, as an integer.
277     episode_id:     Id of the video episode, as a unicode string.
278
279     The following fields should only be used when the media is a track or a part of
280     a music album:
281
282     track:          Title of the track.
283     track_number:   Number of the track within an album or a disc, as an integer.
284     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
285                     as a unicode string.
286     artist:         Artist(s) of the track.
287     genre:          Genre(s) of the track.
288     album:          Title of the album the track belongs to.
289     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
290     album_artist:   List of all artists appeared on the album (e.g.
291                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
292                     and compilations).
293     disc_number:    Number of the disc or other physical medium the track belongs to,
294                     as an integer.
295     release_year:   Year (YYYY) when the album was released.
296
297     Unless mentioned otherwise, the fields should be Unicode strings.
298
299     Unless mentioned otherwise, None is equivalent to absence of information.
300
301
302     _type "playlist" indicates multiple videos.
303     There must be a key "entries", which is a list, an iterable, or a PagedList
304     object, each element of which is a valid dictionary by this specification.
305
306     Additionally, playlists can have "id", "title", "description", "uploader",
307     "uploader_id", "uploader_url" attributes with the same semantics as videos
308     (see above).
309
310
311     _type "multi_video" indicates that there are multiple videos that
312     form a single show, for example multiple acts of an opera or TV episode.
313     It must have an entries key like a playlist and contain all the keys
314     required for a video at the same time.
315
316
317     _type "url" indicates that the video must be extracted from another
318     location, possibly by a different extractor. Its only required key is:
319     "url" - the next URL to extract.
320     The key "ie_key" can be set to the class name (minus the trailing "IE",
321     e.g. "Youtube") if the extractor class is known in advance.
322     Additionally, the dictionary may have any properties of the resolved entity
323     known in advance, for example "title" if the title of the referred video is
324     known ahead of time.
325
326
327     _type "url_transparent" entities have the same specification as "url", but
328     indicate that the given additional information is more precise than the one
329     associated with the resolved URL.
330     This is useful when a site employs a video service that hosts the video and
331     its technical metadata, but that video service does not embed a useful
332     title, description etc.
333
334
335     Subclasses of this one should re-define the _real_initialize() and
336     _real_extract() methods and define a _VALID_URL regexp.
337     Probably, they should also be added to the list of extractors.
338
339     _GEO_BYPASS attribute may be set to False in order to disable
340     geo restriction bypass mechanisms for a particular extractor.
341     Though it won't disable explicit geo restriction bypass based on
342     country code provided with geo_bypass_country. (experimental)
343
344     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
345     countries for this extractor. One of these countries will be used by
346     geo restriction bypass mechanism right away in order to bypass
347     geo restriction, of course, if the mechanism is not disabled. (experimental)
348
349     NB: both these geo attributes are experimental and may change in future
350     or be completely removed.
351
352     Finally, the _WORKING attribute should be set to False for broken IEs
353     in order to warn the users and skip the tests.
354     """
355
    _ready = False  # True once _real_initialize() has been run (see initialize())
    _downloader = None  # the YoutubeDL instance driving this extractor
    _x_forwarded_for_ip = None  # fake source IP used for geo bypass, if any
    _GEO_BYPASS = True  # set to False in subclasses to disable geo bypass (experimental)
    _GEO_COUNTRIES = None  # list of presumably geo-unrestricted country codes
    _WORKING = True  # set to False for broken extractors (warns users, skips tests)
362
363     def __init__(self, downloader=None):
364         """Constructor. Receives an optional downloader."""
365         self._ready = False
366         self._x_forwarded_for_ip = None
367         self.set_downloader(downloader)
368
369     @classmethod
370     def suitable(cls, url):
371         """Receives a URL and returns True if suitable for this IE."""
372
373         # This does not use has/getattr intentionally - we want to know whether
374         # we have cached the regexp for *this* class, whereas getattr would also
375         # match the superclass
376         if '_VALID_URL_RE' not in cls.__dict__:
377             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
378         return cls._VALID_URL_RE.match(url) is not None
379
380     @classmethod
381     def _match_id(cls, url):
382         if '_VALID_URL_RE' not in cls.__dict__:
383             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
384         m = cls._VALID_URL_RE.match(url)
385         assert m
386         return compat_str(m.group('id'))
387
    @classmethod
    def working(cls):
        """Getter method for _WORKING (False means the IE is known broken)."""
        return cls._WORKING
392
393     def initialize(self):
394         """Initializes an instance (authentication, etc)."""
395         self._initialize_geo_bypass(self._GEO_COUNTRIES)
396         if not self._ready:
397             self._real_initialize()
398             self._ready = True
399
400     def _initialize_geo_bypass(self, countries):
401         """
402         Initialize geo restriction bypass mechanism.
403
404         This method is used to initialize geo bypass mechanism based on faking
405         X-Forwarded-For HTTP header. A random country from provided country list
406         is selected and a random IP belonging to this country is generated. This
407         IP will be passed as X-Forwarded-For HTTP header in all subsequent
408         HTTP requests.
409
410         This method will be used for initial geo bypass mechanism initialization
411         during the instance initialization with _GEO_COUNTRIES.
412
413         You may also manually call it from extractor's code if geo countries
414         information is not available beforehand (e.g. obtained during
415         extraction) or due to some another reason.
416         """
417         if not self._x_forwarded_for_ip:
418             country_code = self._downloader.params.get('geo_bypass_country', None)
419             # If there is no explicit country for geo bypass specified and
420             # the extractor is known to be geo restricted let's fake IP
421             # as X-Forwarded-For right away.
422             if (not country_code and
423                     self._GEO_BYPASS and
424                     self._downloader.params.get('geo_bypass', True) and
425                     countries):
426                 country_code = random.choice(countries)
427             if country_code:
428                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
429                 if self._downloader.params.get('verbose', False):
430                     self._downloader.to_screen(
431                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
432                         % (self._x_forwarded_for_ip, country_code.upper()))
433
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most one retry: the second pass only happens after a geo
            # restriction error caused a fake X-Forwarded-For IP to be set up.
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        # Expose the faked IP so the downloader can reuse it
                        # for the actual media requests.
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            # Already a presentable extractor error - propagate unchanged.
            # (GeoRestrictedError re-raised above also ends up here.)
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            # Unexpected internal failure of _real_extract
            raise ExtractorError('An extractor error has occurred.', cause=e)
454
455     def __maybe_fake_ip_and_retry(self, countries):
456         if (not self._downloader.params.get('geo_bypass_country', None) and
457                 self._GEO_BYPASS and
458                 self._downloader.params.get('geo_bypass', True) and
459                 not self._x_forwarded_for_ip and
460                 countries):
461             country_code = random.choice(countries)
462             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
463             if self._x_forwarded_for_ip:
464                 self.report_warning(
465                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
466                     % (self._x_forwarded_for_ip, country_code.upper()))
467                 return True
468         return False
469
    def set_downloader(self, downloader):
        """Sets the downloader (a YoutubeDL instance) for this IE."""
        self._downloader = downloader
473
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
        pass
477
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
        pass
481
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Extractor class names end in "IE" by convention; strip that suffix.
        return compat_str(cls.__name__[:-2])
486
    @property
    def IE_NAME(self):
        # Human-readable extractor name: class name minus the trailing "IE".
        return compat_str(type(self).__name__[:-2])
490
491     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
492         """ Returns the response handle """
493         if note is None:
494             self.report_download_webpage(video_id)
495         elif note is not False:
496             if video_id is None:
497                 self.to_screen('%s' % (note,))
498             else:
499                 self.to_screen('%s: %s' % (video_id, note))
500
501         # Some sites check X-Forwarded-For HTTP header in order to figure out
502         # the origin of the client behind proxy. This allows bypassing geo
503         # restriction by faking this header's value to IP that belongs to some
504         # geo unrestricted country. We will do so once we encounter any
505         # geo restriction error.
506         if self._x_forwarded_for_ip:
507             if 'X-Forwarded-For' not in headers:
508                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
509
510         if isinstance(url_or_request, compat_urllib_request.Request):
511             url_or_request = update_Request(
512                 url_or_request, data=data, headers=headers, query=query)
513         else:
514             if query:
515                 url_or_request = update_url_query(url_or_request, query)
516             if data is not None or headers:
517                 url_or_request = sanitized_Request(url_or_request, data, headers)
518         try:
519             return self._downloader.urlopen(url_or_request)
520         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
521             if errnote is False:
522                 return False
523             if errnote is None:
524                 errnote = 'Unable to download webpage'
525
526             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
527             if fatal:
528                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
529             else:
530                 self._downloader.report_warning(errmsg)
531                 return False
532
533     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
534         """ Returns a tuple (page content as string, URL handle) """
535         # Strip hashes from the URL (#1038)
536         if isinstance(url_or_request, (compat_str, str)):
537             url_or_request = url_or_request.partition('#')[0]
538
539         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
540         if urlh is False:
541             assert not fatal
542             return False
543         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
544         return (content, urlh)
545
546     @staticmethod
547     def _guess_encoding_from_content(content_type, webpage_bytes):
548         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
549         if m:
550             encoding = m.group(1)
551         else:
552             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
553                           webpage_bytes[:1024])
554             if m:
555                 encoding = m.group(1).decode('ascii')
556             elif webpage_bytes.startswith(b'\xff\xfe'):
557                 encoding = 'utf-16'
558             else:
559                 encoding = 'utf-8'
560
561         return encoding
562
    def __check_blocked(self, content):
        # Detect well-known censorship/filtering interstitial pages and raise
        # an expected ExtractorError with a helpful message instead of
        # silently returning garbage content.
        first_block = content[:512]
        # Websense corporate filtering
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian government (Roskomnadzor) block page served by TTK
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
590
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the response body from urlh and decode it to a string.

        Honors the dump_intermediate_pages and write_pages options, guesses
        the encoding when none is given, and raises for known block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            # Caller may prepend bytes already consumed from the stream
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps arbitrary bytes printable on any terminal
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the filename short but still unique via an md5 suffix
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name from the guess - fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
627
628     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
629         """ Returns the data of the page as a string """
630         success = False
631         try_count = 0
632         while success is False:
633             try:
634                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
635                 success = True
636             except compat_http_client.IncompleteRead as e:
637                 try_count += 1
638                 if try_count >= tries:
639                     raise e
640                 self._sleep(timeout, video_id)
641         if res is False:
642             return res
643         else:
644             content, _ = res
645             return content
646
647     def _download_xml_handle(
648             self, url_or_request, video_id, note='Downloading XML',
649             errnote='Unable to download XML', transform_source=None,
650             fatal=True, encoding=None, data=None, headers={}, query={}):
651         """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
652         res = self._download_webpage_handle(
653             url_or_request, video_id, note, errnote, fatal=fatal,
654             encoding=encoding, data=data, headers=headers, query=query)
655         if res is False:
656             return res
657         xml_string, urlh = res
658         return self._parse_xml(
659             xml_string, video_id, transform_source=transform_source,
660             fatal=fatal), urlh
661
662     def _download_xml(self, url_or_request, video_id,
663                       note='Downloading XML', errnote='Unable to download XML',
664                       transform_source=None, fatal=True, encoding=None,
665                       data=None, headers={}, query={}):
666         """Return the xml as an xml.etree.ElementTree.Element"""
667         res = self._download_xml_handle(
668             url_or_request, video_id, note=note, errnote=errnote,
669             transform_source=transform_source, fatal=fatal, encoding=encoding,
670             data=data, headers=headers, query=query)
671         return res if res is False else res[0]
672
673     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
674         if transform_source:
675             xml_string = transform_source(xml_string)
676         try:
677             return compat_etree_fromstring(xml_string.encode('utf-8'))
678         except compat_xml_parse_error as ve:
679             errmsg = '%s: Failed to parse XML ' % video_id
680             if fatal:
681                 raise ExtractorError(errmsg, cause=ve)
682             else:
683                 self.report_warning(errmsg + str(ve))
684
685     def _download_json_handle(
686             self, url_or_request, video_id, note='Downloading JSON metadata',
687             errnote='Unable to download JSON metadata', transform_source=None,
688             fatal=True, encoding=None, data=None, headers={}, query={}):
689         """Return a tuple (JSON object, URL handle)"""
690         res = self._download_webpage_handle(
691             url_or_request, video_id, note, errnote, fatal=fatal,
692             encoding=encoding, data=data, headers=headers, query=query)
693         if res is False:
694             return res
695         json_string, urlh = res
696         return self._parse_json(
697             json_string, video_id, transform_source=transform_source,
698             fatal=fatal), urlh
699
700     def _download_json(
701             self, url_or_request, video_id, note='Downloading JSON metadata',
702             errnote='Unable to download JSON metadata', transform_source=None,
703             fatal=True, encoding=None, data=None, headers={}, query={}):
704         res = self._download_json_handle(
705             url_or_request, video_id, note=note, errnote=errnote,
706             transform_source=transform_source, fatal=fatal, encoding=encoding,
707             data=data, headers=headers, query=query)
708         return res if res is False else res[0]
709
710     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
711         if transform_source:
712             json_string = transform_source(json_string)
713         try:
714             return json.loads(json_string)
715         except ValueError as ve:
716             errmsg = '%s: Failed to parse JSON ' % video_id
717             if fatal:
718                 raise ExtractorError(errmsg, cause=ve)
719             else:
720                 self.report_warning(errmsg + str(ve))
721
722     def report_warning(self, msg, video_id=None):
723         idstr = '' if video_id is None else '%s: ' % video_id
724         self._downloader.report_warning(
725             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
726
727     def to_screen(self, msg):
728         """Print msg to screen, prefixing it with '[ie_name]'"""
729         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
730
731     def report_extraction(self, id_or_name):
732         """Report information extraction."""
733         self.to_screen('%s: Extracting information' % id_or_name)
734
735     def report_download_webpage(self, video_id):
736         """Report webpage download."""
737         self.to_screen('%s: Downloading webpage' % video_id)
738
739     def report_age_confirmation(self):
740         """Report attempt to confirm age."""
741         self.to_screen('Confirming age')
742
743     def report_login(self):
744         """Report attempt to log in."""
745         self.to_screen('Logging in')
746
747     @staticmethod
748     def raise_login_required(msg='This video is only available for registered users'):
749         raise ExtractorError(
750             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
751             expected=True)
752
753     @staticmethod
754     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
755         raise GeoRestrictedError(msg, countries=countries)
756
757     # Methods for following #608
758     @staticmethod
759     def url_result(url, ie=None, video_id=None, video_title=None):
760         """Returns a URL that points to a page that should be processed"""
761         # TODO: ie should be the class used for getting the info
762         video_info = {'_type': 'url',
763                       'url': url,
764                       'ie_key': ie}
765         if video_id is not None:
766             video_info['id'] = video_id
767         if video_title is not None:
768             video_info['title'] = video_title
769         return video_info
770
771     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
772         urls = orderedSet(
773             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
774             for m in matches)
775         return self.playlist_result(
776             urls, playlist_id=playlist_id, playlist_title=playlist_title)
777
778     @staticmethod
779     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
780         """Returns a playlist"""
781         video_info = {'_type': 'playlist',
782                       'entries': entries}
783         if playlist_id:
784             video_info['id'] = playlist_id
785         if playlist_title:
786             video_info['title'] = playlist_title
787         if playlist_description:
788             video_info['description'] = playlist_description
789         return video_info
790
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Try each pattern in turn; the first one that matches wins.
            # NOTE(review): an empty pattern iterable would leave mobj unbound --
            # callers are expected to always pass at least one pattern
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue when stderr is a color-capable tty
        # (never on Windows consoles, which don't handle ANSI escapes here)
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
824
825     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
826         """
827         Like _search_regex, but strips HTML tags and unescapes entities.
828         """
829         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
830         if res:
831             return clean_html(res).strip()
832         else:
833             return res
834
835     def _get_netrc_login_info(self, netrc_machine=None):
836         username = None
837         password = None
838         netrc_machine = netrc_machine or self._NETRC_MACHINE
839
840         if self._downloader.params.get('usenetrc', False):
841             try:
842                 info = netrc.netrc().authenticators(netrc_machine)
843                 if info is not None:
844                     username = info[0]
845                     password = info[2]
846                 else:
847                     raise netrc.NetrcParseError(
848                         'No authenticators for %s' % netrc_machine)
849             except (IOError, netrc.NetrcParseError) as err:
850                 self._downloader.report_warning(
851                     'parsing .netrc: %s' % error_to_compat_str(err))
852
853         return username, password
854
855     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
856         """
857         Get the login info as (username, password)
858         First look for the manually specified credentials using username_option
859         and password_option as keys in params dictionary. If no such credentials
860         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
861         value.
862         If there's no info available, return (None, None)
863         """
864         if self._downloader is None:
865             return (None, None)
866
867         downloader_params = self._downloader.params
868
869         # Attempt to use provided username and password or .netrc data
870         if downloader_params.get(username_option) is not None:
871             username = downloader_params[username_option]
872             password = downloader_params[password_option]
873         else:
874             username, password = self._get_netrc_login_info(netrc_machine)
875
876         return username, password
877
878     def _get_tfa_info(self, note='two-factor verification code'):
879         """
880         Get the two-factor authentication info
881         TODO - asking the user will be required for sms/phone verify
882         currently just uses the command line option
883         If there's no info available, return None
884         """
885         if self._downloader is None:
886             return None
887         downloader_params = self._downloader.params
888
889         if downloader_params.get('twofactor') is not None:
890             return downloader_params['twofactor']
891
892         return compat_getpass('Type %s and press [Return]: ' % note)
893
894     # Helper functions for extracting OpenGraph info
895     @staticmethod
896     def _og_regexes(prop):
897         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
898         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
899                        % {'prop': re.escape(prop)})
900         template = r'<meta[^>]+?%s[^>]+?%s'
901         return [
902             template % (property_re, content_re),
903             template % (content_re, property_re),
904         ]
905
906     @staticmethod
907     def _meta_regex(prop):
908         return r'''(?isx)<meta
909                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
910                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
911
912     def _og_search_property(self, prop, html, name=None, **kargs):
913         if not isinstance(prop, (list, tuple)):
914             prop = [prop]
915         if name is None:
916             name = 'OpenGraph %s' % prop[0]
917         og_regexes = []
918         for p in prop:
919             og_regexes.extend(self._og_regexes(p))
920         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
921         if escaped is None:
922             return None
923         return unescapeHTML(escaped)
924
925     def _og_search_thumbnail(self, html, **kargs):
926         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
927
928     def _og_search_description(self, html, **kargs):
929         return self._og_search_property('description', html, fatal=False, **kargs)
930
931     def _og_search_title(self, html, **kargs):
932         return self._og_search_property('title', html, **kargs)
933
934     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
935         regexes = self._og_regexes('video') + self._og_regexes('video:url')
936         if secure:
937             regexes = self._og_regexes('video:secure_url') + regexes
938         return self._html_search_regex(regexes, html, name, **kargs)
939
940     def _og_search_url(self, html, **kargs):
941         return self._og_search_property('url', html, **kargs)
942
943     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
944         if not isinstance(name, (list, tuple)):
945             name = [name]
946         if display_name is None:
947             display_name = name[0]
948         return self._html_search_regex(
949             [self._meta_regex(n) for n in name],
950             html, display_name, fatal=fatal, group='content', **kwargs)
951
952     def _dc_search_uploader(self, html):
953         return self._html_search_meta('dc.creator', html, 'uploader')
954
955     def _rta_search(self, html):
956         # See http://www.rtalabel.org/index.php?content=howtofaq#single
957         if re.search(r'(?ix)<meta\s+name="rating"\s+'
958                      r'     content="RTA-5042-1996-1400-1577-RTA"',
959                      html):
960             return 18
961         return 0
962
963     def _media_rating_search(self, html):
964         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
965         rating = self._html_search_meta('rating', html)
966
967         if not rating:
968             return None
969
970         RATING_TABLE = {
971             'safe for kids': 0,
972             'general': 8,
973             '14 years': 14,
974             'mature': 17,
975             'restricted': 19,
976         }
977         return RATING_TABLE.get(rating.lower())
978
979     def _family_friendly_search(self, html):
980         # See http://schema.org/VideoObject
981         family_friendly = self._html_search_meta(
982             'isFamilyFriendly', html, default=None)
983
984         if not family_friendly:
985             return None
986
987         RATING_TABLE = {
988             '1': 0,
989             'true': 0,
990             '0': 18,
991             'false': 18,
992         }
993         return RATING_TABLE.get(family_friendly.lower())
994
995     def _twitter_search_player(self, html):
996         return self._html_search_meta('twitter:player', html,
997                                       'twitter card player')
998
999     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1000         json_ld = self._search_regex(
1001             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
1002             html, 'JSON-LD', group='json_ld', **kwargs)
1003         default = kwargs.get('default', NO_DEFAULT)
1004         if not json_ld:
1005             return default if default is not NO_DEFAULT else {}
1006         # JSON-LD may be malformed and thus `fatal` should be respected.
1007         # At the same time `default` may be passed that assumes `fatal=False`
1008         # for _search_regex. Let's simulate the same behavior here as well.
1009         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1010         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1011
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Extract video metadata from JSON-LD (string, dict, or list of dicts).

        Returns an info dict containing only the non-None fields found;
        `expected_type` restricts extraction to a particular @type.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        # Normalize to a list so single-object and multi-object JSON-LD
        # documents are handled by the same loop below
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org interactionType suffixes to info dict count kinds
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            # Populate <kind>_count entries from interactionStatistic counters;
            # the first counter seen for a kind wins
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # interactionType is a schema.org URL; only its last path
                # segment identifies the action
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Fill info from a schema.org VideoObject node
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only process nodes that declare the schema.org context
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Keep scanning: later nodes may add more metadata
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                # Stop after the first recognized non-VideoObject node
                break
        # Drop fields that ended up None so callers can use dict merging safely
        return dict((k, v) for k, v in info.items() if v is not None)
1105
1106     @staticmethod
1107     def _hidden_inputs(html):
1108         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1109         hidden_inputs = {}
1110         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1111             attrs = extract_attributes(input)
1112             if not input:
1113                 continue
1114             if attrs.get('type') not in ('hidden', 'submit'):
1115                 continue
1116             name = attrs.get('name') or attrs.get('id')
1117             value = attrs.get('value')
1118             if name and value is not None:
1119                 hidden_inputs[name] = value
1120         return hidden_inputs
1121
1122     def _form_hidden_inputs(self, form_id, html):
1123         form = self._search_regex(
1124             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1125             html, '%s form' % form_id, group='form')
1126         return self._hidden_inputs(form)
1127
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        When `field_preference` (a list/tuple of format dict keys) is given,
        formats are ordered by those fields alone; otherwise a built-in
        multi-criteria key (protocol, codecs, resolution, bitrate, ...) is used.
        Raises ExtractorError when `formats` is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering: missing values sort lowest
                # ('' for format_id since it is compared as a string)
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Prefer plain HTTP(S) over RTSP and other protocols
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple comparison: earlier fields dominate; missing numeric
            # fields sort as -1 (i.e. worst), missing format_id as ''
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1203
1204     def _check_formats(self, formats, video_id):
1205         if formats:
1206             formats[:] = filter(
1207                 lambda f: self._is_valid_url(
1208                     f['url'], video_id,
1209                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1210                 formats)
1211
1212     @staticmethod
1213     def _remove_duplicate_formats(formats):
1214         format_urls = set()
1215         unique_formats = []
1216         for f in formats:
1217             if f['url'] not in format_urls:
1218                 format_urls.add(f['url'])
1219                 unique_formats.append(f)
1220         formats[:] = unique_formats
1221
1222     def _is_valid_url(self, url, video_id, item='video', headers={}):
1223         url = self._proto_relative_url(url, scheme='http:')
1224         # For now assume non HTTP(S) URLs always valid
1225         if not (url.startswith('http://') or url.startswith('https://')):
1226             return True
1227         try:
1228             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1229             return True
1230         except ExtractorError as e:
1231             if isinstance(e.cause, compat_urllib_error.URLError):
1232                 self.to_screen(
1233                     '%s: %s URL is invalid, skipping' % (video_id, item))
1234                 return False
1235             raise
1236
1237     def http_scheme(self):
1238         """ Either "http:" or "https:", depending on the user's preferences """
1239         return (
1240             'http:'
1241             if self._downloader.params.get('prefer_insecure', False)
1242             else 'https:')
1243
1244     def _proto_relative_url(self, url, scheme=None):
1245         if url is None:
1246             return url
1247         if url.startswith('//'):
1248             if scheme is None:
1249                 scheme = self.http_scheme()
1250             return scheme + url
1251         else:
1252             return url
1253
1254     def _sleep(self, timeout, video_id, msg_template=None):
1255         if msg_template is None:
1256             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1257         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1258         self.to_screen(msg)
1259         time.sleep(timeout)
1260
1261     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1262                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1263                              fatal=True, m3u8_id=None):
1264         manifest = self._download_xml(
1265             manifest_url, video_id, 'Downloading f4m manifest',
1266             'Unable to download f4m manifest',
1267             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1268             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1269             transform_source=transform_source,
1270             fatal=fatal)
1271
1272         if manifest is False:
1273             return []
1274
1275         return self._parse_f4m_formats(
1276             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1277             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1278
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an Adobe HDS (f4m) manifest XML element into a list of format dicts.

        manifest is the parsed XML tree; manifest_url is the URL it was
        fetched from (used to resolve relative media URLs). Returns [] when
        the manifest is protected by an Akamai player verification challenge
        or contains no usable (non-DRM) media nodes. Nested f4m/m3u8
        manifests referenced from set-level manifests are extracted
        recursively.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # f4m 2.0 uses a different XML namespace for the same elements
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Fall back to the node index when no bitrate is advertised
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                # NOTE: manifest_url is rebound here; later iterations resolve
                # relative URLs against manifest_base_url, not the original URL
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # bootstrap info implies a stream-level manifest muxed as FLV
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1376
1377     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1378         return {
1379             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1380             'url': m3u8_url,
1381             'ext': ext,
1382             'protocol': 'm3u8',
1383             'preference': preference - 100 if preference else -100,
1384             'resolution': 'multiple',
1385             'format_note': 'Quality selection URL',
1386         }
1387
1388     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1389                               entry_protocol='m3u8', preference=None,
1390                               m3u8_id=None, note=None, errnote=None,
1391                               fatal=True, live=False):
1392         res = self._download_webpage_handle(
1393             m3u8_url, video_id,
1394             note=note or 'Downloading m3u8 information',
1395             errnote=errnote or 'Failed to download m3u8 information',
1396             fatal=fatal)
1397
1398         if res is False:
1399             return []
1400
1401         m3u8_doc, urlh = res
1402         m3u8_url = urlh.geturl()
1403
1404         return self._parse_m3u8_formats(
1405             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1406             preference=preference, m3u8_id=m3u8_id, live=live)
1407
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS (m3u8) playlist into a list of format dicts.

        m3u8_doc is the playlist text, m3u8_url the URL it was downloaded
        from (used to resolve relative URLs). DRM-protected playlists yield
        []; a media playlist yields a single passthrough format; a master
        playlist yields one format per variant stream plus standalone
        audio/video renditions.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve possibly-relative URLs against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts
        groups = {}
        # attributes of the EXT-X-STREAM-INF tag preceding the current URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Register a rendition in groups; AUDIO/VIDEO renditions that
            # carry their own URI are also emitted as standalone formats.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-tag line is the URI of the variant described by the
                # preceding EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing audio group an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1566
1567     @staticmethod
1568     def _xpath_ns(path, namespace=None):
1569         if not namespace:
1570             return path
1571         out = []
1572         for c in path.split('/'):
1573             if not c or c == '.':
1574                 out.append(c)
1575             else:
1576                 out.append('{%s}%s' % (namespace, c))
1577         return '/'.join(out)
1578
1579     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1580         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1581
1582         if smil is False:
1583             assert not fatal
1584             return []
1585
1586         namespace = self._parse_smil_namespace(smil)
1587
1588         return self._parse_smil_formats(
1589             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1590
1591     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1592         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1593         if smil is False:
1594             return {}
1595         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1596
1597     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1598         return self._download_xml(
1599             smil_url, video_id, 'Downloading SMIL file',
1600             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1601
1602     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1603         namespace = self._parse_smil_namespace(smil)
1604
1605         formats = self._parse_smil_formats(
1606             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1607         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1608
1609         video_id = os.path.splitext(url_basename(smil_url))[0]
1610         title = None
1611         description = None
1612         upload_date = None
1613         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1614             name = meta.attrib.get('name')
1615             content = meta.attrib.get('content')
1616             if not name or not content:
1617                 continue
1618             if not title and name == 'title':
1619                 title = content
1620             elif not description and name in ('description', 'abstract'):
1621                 description = content
1622             elif not upload_date and name == 'date':
1623                 upload_date = unified_strdate(content)
1624
1625         thumbnails = [{
1626             'id': image.get('type'),
1627             'url': image.get('src'),
1628             'width': int_or_none(image.get('width')),
1629             'height': int_or_none(image.get('height')),
1630         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1631
1632         return {
1633             'id': video_id,
1634             'title': title or video_id,
1635             'description': description,
1636             'upload_date': upload_date,
1637             'thumbnails': thumbnails,
1638             'formats': formats,
1639             'subtitles': subtitles,
1640         }
1641
    def _parse_smil_namespace(self, smil):
        # Extract the XML namespace from the root tag, e.g.
        # '{http://www.w3.org/2001/SMIL20/Language}smil' -> that URI;
        # returns None for an un-namespaced <smil> root.
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1645
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from the <video>/<audio> nodes of a SMIL tree.

        transform_rtmp_url, when given, is a callable
        (streamer, play_path) -> (streamer, play_path) applied to each RTMP
        format after it is appended. f4m_params are appended to HDS manifest
        URLs as query arguments.
        """
        # <meta base=.../> (or httpBase) overrides the manifest URL as the
        # base for resolving relative media URLs
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # already-seen src values, to skip duplicate media nodes
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry HLS result usually lacks quality metadata;
                # copy it over from this SMIL medium
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            # NOTE(review): the validity probe is given the raw (possibly
            # relative) src rather than the resolved src_url -- confirm this
            # is intentional
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats
1739
1740     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1741         urls = []
1742         subtitles = {}
1743         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1744             src = textstream.get('src')
1745             if not src or src in urls:
1746                 continue
1747             urls.append(src)
1748             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1749             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1750             subtitles.setdefault(lang, []).append({
1751                 'url': src,
1752                 'ext': ext,
1753             })
1754         return subtitles
1755
1756     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1757         xspf = self._download_xml(
1758             xspf_url, playlist_id, 'Downloading xpsf playlist',
1759             'Unable to download xspf manifest', fatal=fatal)
1760         if xspf is False:
1761             return []
1762         return self._parse_xspf(
1763             xspf, playlist_id, xspf_url=xspf_url,
1764             xspf_base_url=base_url(xspf_url))
1765
1766     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1767         NS_MAP = {
1768             'xspf': 'http://xspf.org/ns/0/',
1769             's1': 'http://static.streamone.nl/player/ns/0',
1770         }
1771
1772         entries = []
1773         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1774             title = xpath_text(
1775                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1776             description = xpath_text(
1777                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1778             thumbnail = xpath_text(
1779                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1780             duration = float_or_none(
1781                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1782
1783             formats = []
1784             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1785                 format_url = urljoin(xspf_base_url, location.text)
1786                 if not format_url:
1787                     continue
1788                 formats.append({
1789                     'url': format_url,
1790                     'manifest_url': xspf_url,
1791                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1792                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1793                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1794                 })
1795             self._sort_formats(formats)
1796
1797             entries.append({
1798                 'id': playlist_id,
1799                 'title': title,
1800                 'description': description,
1801                 'thumbnail': thumbnail,
1802                 'duration': duration,
1803                 'formats': formats,
1804             })
1805         return entries
1806
1807     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1808         res = self._download_xml_handle(
1809             mpd_url, video_id,
1810             note=note or 'Downloading MPD manifest',
1811             errnote=errnote or 'Failed to download MPD manifest',
1812             fatal=fatal)
1813         if res is False:
1814             return []
1815         mpd_doc, urlh = res
1816         mpd_base_url = base_url(urlh.geturl())
1817
1818         return self._parse_mpd_formats(
1819             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1820             formats_dict=formats_dict, mpd_url=mpd_url)
1821
1822     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1823         """
1824         Parse formats from MPD manifest.
1825         References:
1826          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1827             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1828          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1829         """
1830         if mpd_doc.get('type') == 'dynamic':
1831             return []
1832
1833         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1834
1835         def _add_ns(path):
1836             return self._xpath_ns(path, namespace)
1837
1838         def is_drm_protected(element):
1839             return element.find(_add_ns('ContentProtection')) is not None
1840
1841         def extract_multisegment_info(element, ms_parent_info):
1842             ms_info = ms_parent_info.copy()
1843
1844             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1845             # common attributes and elements.  We will only extract relevant
1846             # for us.
1847             def extract_common(source):
1848                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1849                 if segment_timeline is not None:
1850                     s_e = segment_timeline.findall(_add_ns('S'))
1851                     if s_e:
1852                         ms_info['total_number'] = 0
1853                         ms_info['s'] = []
1854                         for s in s_e:
1855                             r = int(s.get('r', 0))
1856                             ms_info['total_number'] += 1 + r
1857                             ms_info['s'].append({
1858                                 't': int(s.get('t', 0)),
1859                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1860                                 'd': int(s.attrib['d']),
1861                                 'r': r,
1862                             })
1863                 start_number = source.get('startNumber')
1864                 if start_number:
1865                     ms_info['start_number'] = int(start_number)
1866                 timescale = source.get('timescale')
1867                 if timescale:
1868                     ms_info['timescale'] = int(timescale)
1869                 segment_duration = source.get('duration')
1870                 if segment_duration:
1871                     ms_info['segment_duration'] = float(segment_duration)
1872
1873             def extract_Initialization(source):
1874                 initialization = source.find(_add_ns('Initialization'))
1875                 if initialization is not None:
1876                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1877
1878             segment_list = element.find(_add_ns('SegmentList'))
1879             if segment_list is not None:
1880                 extract_common(segment_list)
1881                 extract_Initialization(segment_list)
1882                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1883                 if segment_urls_e:
1884                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1885             else:
1886                 segment_template = element.find(_add_ns('SegmentTemplate'))
1887                 if segment_template is not None:
1888                     extract_common(segment_template)
1889                     media = segment_template.get('media')
1890                     if media:
1891                         ms_info['media'] = media
1892                     initialization = segment_template.get('initialization')
1893                     if initialization:
1894                         ms_info['initialization'] = initialization
1895                     else:
1896                         extract_Initialization(segment_template)
1897             return ms_info
1898
1899         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1900         formats = []
1901         for period in mpd_doc.findall(_add_ns('Period')):
1902             period_duration = parse_duration(period.get('duration')) or mpd_duration
1903             period_ms_info = extract_multisegment_info(period, {
1904                 'start_number': 1,
1905                 'timescale': 1,
1906             })
1907             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1908                 if is_drm_protected(adaptation_set):
1909                     continue
1910                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1911                 for representation in adaptation_set.findall(_add_ns('Representation')):
1912                     if is_drm_protected(representation):
1913                         continue
1914                     representation_attrib = adaptation_set.attrib.copy()
1915                     representation_attrib.update(representation.attrib)
1916                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1917                     mime_type = representation_attrib['mimeType']
1918                     content_type = mime_type.split('/')[0]
1919                     if content_type == 'text':
1920                         # TODO implement WebVTT downloading
1921                         pass
1922                     elif content_type in ('video', 'audio'):
1923                         base_url = ''
1924                         for element in (representation, adaptation_set, period, mpd_doc):
1925                             base_url_e = element.find(_add_ns('BaseURL'))
1926                             if base_url_e is not None:
1927                                 base_url = base_url_e.text + base_url
1928                                 if re.match(r'^https?://', base_url):
1929                                     break
1930                         if mpd_base_url and not re.match(r'^https?://', base_url):
1931                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1932                                 mpd_base_url += '/'
1933                             base_url = mpd_base_url + base_url
1934                         representation_id = representation_attrib.get('id')
1935                         lang = representation_attrib.get('lang')
1936                         url_el = representation.find(_add_ns('BaseURL'))
1937                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1938                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1939                         f = {
1940                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1941                             'url': base_url,
1942                             'manifest_url': mpd_url,
1943                             'ext': mimetype2ext(mime_type),
1944                             'width': int_or_none(representation_attrib.get('width')),
1945                             'height': int_or_none(representation_attrib.get('height')),
1946                             'tbr': float_or_none(bandwidth, 1000),
1947                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1948                             'fps': int_or_none(representation_attrib.get('frameRate')),
1949                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1950                             'format_note': 'DASH %s' % content_type,
1951                             'filesize': filesize,
1952                             'container': mimetype2ext(mime_type) + '_dash',
1953                         }
1954                         f.update(parse_codecs(representation_attrib.get('codecs')))
1955                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1956
1957                         def prepare_template(template_name, identifiers):
1958                             t = representation_ms_info[template_name]
1959                             t = t.replace('$RepresentationID$', representation_id)
1960                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1961                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1962                             t.replace('$$', '$')
1963                             return t
1964
1965                         # @initialization is a regular template like @media one
1966                         # so it should be handled just the same way (see
1967                         # https://github.com/rg3/youtube-dl/issues/11605)
1968                         if 'initialization' in representation_ms_info:
1969                             initialization_template = prepare_template(
1970                                 'initialization',
1971                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1972                                 # $Time$ shall not be included for @initialization thus
1973                                 # only $Bandwidth$ remains
1974                                 ('Bandwidth', ))
1975                             representation_ms_info['initialization_url'] = initialization_template % {
1976                                 'Bandwidth': bandwidth,
1977                             }
1978
1979                         def location_key(location):
1980                             return 'url' if re.match(r'^https?://', location) else 'path'
1981
1982                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1983
1984                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1985                             media_location_key = location_key(media_template)
1986
1987                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1988                             # can't be used at the same time
1989                             if '%(Number' in media_template and 's' not in representation_ms_info:
1990                                 segment_duration = None
1991                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1992                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1993                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1994                                 representation_ms_info['fragments'] = [{
1995                                     media_location_key: media_template % {
1996                                         'Number': segment_number,
1997                                         'Bandwidth': bandwidth,
1998                                     },
1999                                     'duration': segment_duration,
2000                                 } for segment_number in range(
2001                                     representation_ms_info['start_number'],
2002                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2003                             else:
2004                                 # $Number*$ or $Time$ in media template with S list available
2005                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2006                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2007                                 representation_ms_info['fragments'] = []
2008                                 segment_time = 0
2009                                 segment_d = None
2010                                 segment_number = representation_ms_info['start_number']
2011
2012                                 def add_segment_url():
2013                                     segment_url = media_template % {
2014                                         'Time': segment_time,
2015                                         'Bandwidth': bandwidth,
2016                                         'Number': segment_number,
2017                                     }
2018                                     representation_ms_info['fragments'].append({
2019                                         media_location_key: segment_url,
2020                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2021                                     })
2022
2023                                 for num, s in enumerate(representation_ms_info['s']):
2024                                     segment_time = s.get('t') or segment_time
2025                                     segment_d = s['d']
2026                                     add_segment_url()
2027                                     segment_number += 1
2028                                     for r in range(s.get('r', 0)):
2029                                         segment_time += segment_d
2030                                         add_segment_url()
2031                                         segment_number += 1
2032                                     segment_time += segment_d
2033                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2034                             # No media template
2035                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2036                             # or any YouTube dashsegments video
2037                             fragments = []
2038                             segment_index = 0
2039                             timescale = representation_ms_info['timescale']
2040                             for s in representation_ms_info['s']:
2041                                 duration = float_or_none(s['d'], timescale)
2042                                 for r in range(s.get('r', 0) + 1):
2043                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2044                                     fragments.append({
2045                                         location_key(segment_uri): segment_uri,
2046                                         'duration': duration,
2047                                     })
2048                                     segment_index += 1
2049                             representation_ms_info['fragments'] = fragments
2050                         elif 'segment_urls' in representation_ms_info:
2051                             # Segment URLs with no SegmentTimeline
2052                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2053                             # https://github.com/rg3/youtube-dl/pull/14844
2054                             fragments = []
2055                             segment_duration = float_or_none(
2056                                 representation_ms_info['segment_duration'],
2057                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2058                             for segment_url in representation_ms_info['segment_urls']:
2059                                 fragment = {
2060                                     location_key(segment_url): segment_url,
2061                                 }
2062                                 if segment_duration:
2063                                     fragment['duration'] = segment_duration
2064                                 fragments.append(fragment)
2065                             representation_ms_info['fragments'] = fragments
2066                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2067                         # No fragments key is present in this case.
2068                         if 'fragments' in representation_ms_info:
2069                             f.update({
2070                                 'fragment_base_url': base_url,
2071                                 'fragments': [],
2072                                 'protocol': 'http_dash_segments',
2073                             })
2074                             if 'initialization_url' in representation_ms_info:
2075                                 initialization_url = representation_ms_info['initialization_url']
2076                                 if not f.get('url'):
2077                                     f['url'] = initialization_url
2078                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2079                             f['fragments'].extend(representation_ms_info['fragments'])
2080                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2081                         # is not necessarily unique within a Period thus formats with
2082                         # the same `format_id` are quite possible. There are numerous examples
2083                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2084                         # https://github.com/rg3/youtube-dl/issues/13919)
2085                         full_info = formats_dict.get(representation_id, {}).copy()
2086                         full_info.update(f)
2087                         formats.append(full_info)
2088                     else:
2089                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2090         return formats
2091
2092     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2093         res = self._download_xml_handle(
2094             ism_url, video_id,
2095             note=note or 'Downloading ISM manifest',
2096             errnote=errnote or 'Failed to download ISM manifest',
2097             fatal=fatal)
2098         if res is False:
2099             return []
2100         ism_doc, urlh = res
2101
2102         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2103
2104     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2105         """
2106         Parse formats from ISM manifest.
2107         References:
2108          1. [MS-SSTR]: Smooth Streaming Protocol,
2109             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2110         """
2111         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2112             return []
2113
2114         duration = int(ism_doc.attrib['Duration'])
2115         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2116
2117         formats = []
2118         for stream in ism_doc.findall('StreamIndex'):
2119             stream_type = stream.get('Type')
2120             if stream_type not in ('video', 'audio'):
2121                 continue
2122             url_pattern = stream.attrib['Url']
2123             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2124             stream_name = stream.get('Name')
2125             for track in stream.findall('QualityLevel'):
2126                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2127                 # TODO: add support for WVC1 and WMAP
2128                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2129                     self.report_warning('%s is not a supported codec' % fourcc)
2130                     continue
2131                 tbr = int(track.attrib['Bitrate']) // 1000
2132                 # [1] does not mention Width and Height attributes. However,
2133                 # they're often present while MaxWidth and MaxHeight are
2134                 # missing, so should be used as fallbacks
2135                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2136                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2137                 sampling_rate = int_or_none(track.get('SamplingRate'))
2138
2139                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2140                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2141
2142                 fragments = []
2143                 fragment_ctx = {
2144                     'time': 0,
2145                 }
2146                 stream_fragments = stream.findall('c')
2147                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2148                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2149                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2150                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2151                     if not fragment_ctx['duration']:
2152                         try:
2153                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2154                         except IndexError:
2155                             next_fragment_time = duration
2156                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2157                     for _ in range(fragment_repeat):
2158                         fragments.append({
2159                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2160                             'duration': fragment_ctx['duration'] / stream_timescale,
2161                         })
2162                         fragment_ctx['time'] += fragment_ctx['duration']
2163
2164                 format_id = []
2165                 if ism_id:
2166                     format_id.append(ism_id)
2167                 if stream_name:
2168                     format_id.append(stream_name)
2169                 format_id.append(compat_str(tbr))
2170
2171                 formats.append({
2172                     'format_id': '-'.join(format_id),
2173                     'url': ism_url,
2174                     'manifest_url': ism_url,
2175                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2176                     'width': width,
2177                     'height': height,
2178                     'tbr': tbr,
2179                     'asr': sampling_rate,
2180                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2181                     'acodec': 'none' if stream_type == 'video' else fourcc,
2182                     'protocol': 'ism',
2183                     'fragments': fragments,
2184                     '_download_params': {
2185                         'duration': duration,
2186                         'timescale': stream_timescale,
2187                         'width': width or 0,
2188                         'height': height or 0,
2189                         'fourcc': fourcc,
2190                         'codec_private_data': track.get('CodecPrivateData'),
2191                         'sampling_rate': sampling_rate,
2192                         'channels': int_or_none(track.get('Channels', 2)),
2193                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2194                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2195                     },
2196                 })
2197         return formats
2198
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags (and their
        amp- equivalents) found in webpage.

        Returns a list of dicts, one per tag that yielded at least one
        format or subtitle track, each with 'formats', 'subtitles' and
        'thumbnail' keys.
        """
        def absolute_url(item_url):
            # Resolve item_url against the page's base URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive ext and codec hints from a MIME type attribute value
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Expand a single src into formats: manifest URLs (m3u8/mpd) may
            # yield many formats, plain media URLs exactly one.
            # NOTE: the mutable default type_info is only read, never mutated
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # Self-closing tags first; they have no inner content
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # A src attribute directly on the media tag
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # Nested <source> tags may carry additional format variants
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # res attribute is not standard but seen several times
                        # in the wild
                        f.update({
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # Nested <track> tags carry subtitles/captions
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Skip tags that yielded neither formats nor subtitles
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2292
2293     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2294         formats = []
2295         hdcore_sign = 'hdcore=3.7.0'
2296         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2297         hds_host = hosts.get('hds')
2298         if hds_host:
2299             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2300         if 'hdcore=' not in f4m_url:
2301             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2302         f4m_formats = self._extract_f4m_formats(
2303             f4m_url, video_id, f4m_id='hds', fatal=False)
2304         for entry in f4m_formats:
2305             entry.update({'extra_param_to_segment_url': hdcore_sign})
2306         formats.extend(f4m_formats)
2307         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2308         hls_host = hosts.get('hls')
2309         if hls_host:
2310             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2311         formats.extend(self._extract_m3u8_formats(
2312             m3u8_url, video_id, 'mp4', 'm3u8_native',
2313             m3u8_id='hls', fatal=False))
2314         return formats
2315
2316     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2317         query = compat_urlparse.urlparse(url).query
2318         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2319         mobj = re.search(
2320             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2321         url_base = mobj.group('url')
2322         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2323         formats = []
2324
2325         def manifest_url(manifest):
2326             m_url = '%s/%s' % (http_base_url, manifest)
2327             if query:
2328                 m_url += '?%s' % query
2329             return m_url
2330
2331         if 'm3u8' not in skip_protocols:
2332             formats.extend(self._extract_m3u8_formats(
2333                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2334                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2335         if 'f4m' not in skip_protocols:
2336             formats.extend(self._extract_f4m_formats(
2337                 manifest_url('manifest.f4m'),
2338                 video_id, f4m_id='hds', fatal=False))
2339         if 'dash' not in skip_protocols:
2340             formats.extend(self._extract_mpd_formats(
2341                 manifest_url('manifest.mpd'),
2342                 video_id, mpd_id='dash', fatal=False))
2343         if re.search(r'(?:/smil:|\.smil)', url_base):
2344             if 'smil' not in skip_protocols:
2345                 rtmp_formats = self._extract_smil_formats(
2346                     manifest_url('jwplayer.smil'),
2347                     video_id, fatal=False)
2348                 for rtmp_format in rtmp_formats:
2349                     rtsp_format = rtmp_format.copy()
2350                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2351                     del rtsp_format['play_path']
2352                     del rtsp_format['ext']
2353                     rtsp_format.update({
2354                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2355                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2356                         'protocol': 'rtsp',
2357                     })
2358                     formats.extend([rtmp_format, rtsp_format])
2359         else:
2360             for protocol in ('rtmp', 'rtsp'):
2361                 if protocol not in skip_protocols:
2362                     formats.append({
2363                         'url': '%s:%s' % (protocol, url_base),
2364                         'format_id': protocol,
2365                         'protocol': protocol,
2366                     })
2367         return formats
2368
2369     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2370         mobj = re.search(
2371             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2372             webpage)
2373         if mobj:
2374             try:
2375                 jwplayer_data = self._parse_json(mobj.group('options'),
2376                                                  video_id=video_id,
2377                                                  transform_source=transform_source)
2378             except ExtractorError:
2379                 pass
2380             else:
2381                 if isinstance(jwplayer_data, dict):
2382                     return jwplayer_data
2383
2384     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2385         jwplayer_data = self._find_jwplayer_data(
2386             webpage, video_id, transform_source=js_to_json)
2387         return self._parse_jwplayer_data(
2388             jwplayer_data, video_id, *args, **kwargs)
2389
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer setup/config dict into info dict(s).

        Returns a single info dict when the playlist has exactly one entry,
        otherwise a playlist result. With require_title=True a missing
        'title' key raises (KeyError); otherwise title may be None.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            # Fall back to the playlist item's mediaid when no video_id given
            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks, defensively skipping anything
            # that is not shaped as expected
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # A single YouTube source is delegated to the YouTube extractor
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2457
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer 'sources' list into a list of format dicts.

        jwplayer_sources_data -- list of source dicts as found in JWPlayer
            setup data; entries that are not dicts or lack a 'file' URL are
            skipped, as are duplicate URLs.
        m3u8_id/mpd_id -- format id prefixes forwarded to the HLS/DASH
            extractors.
        rtmp_params -- optional dict merged into RTMP formats (e.g. app,
            player_url); only applied to rtmp* URLs.
        base_url -- page URL used to resolve relative source URLs.
            NOTE(review): this parameter shadows the base_url helper imported
            from ..utils for the duration of this method.
        Returns the (unsorted) list of format dicts.
        """
        # URLs already emitted, used for de-duplication.
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            # 'file' may be protocol-relative (//host/path).
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            if source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            # Prefer the declared MIME type; fall back to the URL extension.
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                # Audio-only source: mark it as having no video codec.
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                # Progressive (or RTMP) video source.
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split "rtmp://...(mp4|mp3|flv):playpath" into the RTMP
                    # URL proper and the prefixed play path.
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2524
2525     def _live_title(self, name):
2526         """ Generate the title for a live video """
2527         now = datetime.datetime.now()
2528         now_str = now.strftime('%Y-%m-%d %H:%M')
2529         return name + ' ' + now_str
2530
2531     def _int(self, v, name, fatal=False, **kwargs):
2532         res = int_or_none(v, **kwargs)
2533         if 'get_attr' in kwargs:
2534             print(getattr(v, kwargs['get_attr']))
2535         if res is None:
2536             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2537             if fatal:
2538                 raise ExtractorError(msg)
2539             else:
2540                 self._downloader.report_warning(msg)
2541         return res
2542
2543     def _float(self, v, name, fatal=False, **kwargs):
2544         res = float_or_none(v, **kwargs)
2545         if res is None:
2546             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2547             if fatal:
2548                 raise ExtractorError(msg)
2549             else:
2550                 self._downloader.report_warning(msg)
2551         return res
2552
2553     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2554                     path='/', secure=False, discard=False, rest={}, **kwargs):
2555         cookie = compat_cookiejar.Cookie(
2556             0, name, value, port, port is not None, domain, True,
2557             domain.startswith('.'), path, True, secure, expire_time,
2558             discard, None, None, rest)
2559         self._downloader.cookiejar.set_cookie(cookie)
2560
2561     def _get_cookies(self, url):
2562         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2563         req = sanitized_Request(url)
2564         self._downloader.cookiejar.add_cookie_header(req)
2565         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2566
2567     def get_testcases(self, include_onlymatching=False):
2568         t = getattr(self, '_TEST', None)
2569         if t:
2570             assert not hasattr(self, '_TESTS'), \
2571                 '%s has _TEST and _TESTS' % type(self).__name__
2572             tests = [t]
2573         else:
2574             tests = getattr(self, '_TESTS', [])
2575         for t in tests:
2576             if not include_onlymatching and t.get('only_matching', False):
2577                 continue
2578             t['name'] = type(self).__name__[:-len('IE')]
2579             yield t
2580
2581     def is_suitable(self, age_limit):
2582         """ Test whether the extractor is generally suitable for the given
2583         age limit (i.e. pornographic sites are not, all others usually are) """
2584
2585         any_restricted = False
2586         for tc in self.get_testcases(include_onlymatching=False):
2587             if tc.get('playlist', []):
2588                 tc = tc['playlist'][0]
2589             is_restricted = age_restricted(
2590                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2591             if not is_restricted:
2592                 return True
2593             any_restricted = any_restricted or is_restricted
2594         return not any_restricted
2595
2596     def extract_subtitles(self, *args, **kwargs):
2597         if (self._downloader.params.get('writesubtitles', False) or
2598                 self._downloader.params.get('listsubtitles')):
2599             return self._get_subtitles(*args, **kwargs)
2600         return {}
2601
    def _get_subtitles(self, *args, **kwargs):
        # Real subtitle extraction hook; subclasses that support subtitles
        # override this.  Callers should go through extract_subtitles().
        raise NotImplementedError('This method must be implemented by subclasses')
2604
2605     @staticmethod
2606     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2607         """ Merge subtitle items for one language. Items with duplicated URLs
2608         will be dropped. """
2609         list1_urls = set([item['url'] for item in subtitle_list1])
2610         ret = list(subtitle_list1)
2611         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2612         return ret
2613
2614     @classmethod
2615     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2616         """ Merge two subtitle dictionaries, language by language. """
2617         ret = dict(subtitle_dict1)
2618         for lang in subtitle_dict2:
2619             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2620         return ret
2621
2622     def extract_automatic_captions(self, *args, **kwargs):
2623         if (self._downloader.params.get('writeautomaticsub', False) or
2624                 self._downloader.params.get('listsubtitles')):
2625             return self._get_automatic_captions(*args, **kwargs)
2626         return {}
2627
    def _get_automatic_captions(self, *args, **kwargs):
        # Real automatic-caption extraction hook; subclasses that support
        # them override this.  Callers go through extract_automatic_captions().
        raise NotImplementedError('This method must be implemented by subclasses')
2630
2631     def mark_watched(self, *args, **kwargs):
2632         if (self._downloader.params.get('mark_watched', False) and
2633                 (self._get_login_info()[0] is not None or
2634                     self._downloader.params.get('cookiefile') is not None)):
2635             self._mark_watched(*args, **kwargs)
2636
    def _mark_watched(self, *args, **kwargs):
        # Real watch-marking hook; subclasses that support it override this.
        # Callers go through mark_watched(), which checks the user's options.
        raise NotImplementedError('This method must be implemented by subclasses')
2639
2640     def geo_verification_headers(self):
2641         headers = {}
2642         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2643         if geo_verification_proxy:
2644             headers['Ytdl-request-proxy'] = geo_verification_proxy
2645         return headers
2646
2647     def _generic_id(self, url):
2648         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2649
2650     def _generic_title(self, url):
2651         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2652
2653
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (single result), a positive decimal count, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix means a single result; 'all' means the maximum.
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp over-large requests to the supported maximum, with a warning.
            self._downloader.report_warning(
                '%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY