Merge branch 'master' into openload-phantomjs-method
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_http_client,
23     compat_os_name,
24     compat_str,
25     compat_urllib_error,
26     compat_urllib_parse_unquote,
27     compat_urllib_parse_urlencode,
28     compat_urllib_request,
29     compat_urlparse,
30 )
31 from ..downloader.f4m import remove_encrypted_media
32 from ..utils import (
33     NO_DEFAULT,
34     age_restricted,
35     base_url,
36     bug_reports_message,
37     clean_html,
38     compiled_regex_type,
39     determine_ext,
40     determine_protocol,
41     error_to_compat_str,
42     ExtractorError,
43     extract_attributes,
44     fix_xml_ampersands,
45     float_or_none,
46     GeoRestrictedError,
47     GeoUtils,
48     int_or_none,
49     js_to_json,
50     mimetype2ext,
51     orderedSet,
52     parse_codecs,
53     parse_duration,
54     parse_iso8601,
55     parse_m3u8_attributes,
56     RegexNotFoundError,
57     sanitized_Request,
58     sanitize_filename,
59     unescapeHTML,
60     unified_strdate,
61     unified_timestamp,
62     update_Request,
63     update_url_query,
64     urljoin,
65     url_basename,
66     xpath_element,
67     xpath_text,
68     xpath_with_ns,
69 )
70
71
72 class InfoExtractor(object):
73     """Information Extractor class.
74
75     Information extractors are the classes that, given a URL, extract
76     information about the video (or videos) the URL refers to. This
77     information includes the real video URL, the video title, author and
78     others. The information is stored in a dictionary which is then
79     passed to the YoutubeDL. The YoutubeDL processes this
80     information possibly downloading the video to the file system, among
81     other possible outcomes.
82
83     The type field determines the type of the result.
84     By far the most common value (and the default if _type is missing) is
85     "video", which indicates a single video.
86
87     For a video, the dictionaries must include the following fields:
88
89     id:             Video identifier.
90     title:          Video title, unescaped.
91
92     Additionally, it must contain either a formats entry or a url one:
93
94     formats:        A list of dictionaries for each format available, ordered
95                     from worst to best quality.
96
97                     Potential fields:
98                     * url        Mandatory. The URL of the video file
99                     * manifest_url
100                                  The URL of the manifest file in case of
101                                  fragmented media (DASH, hls, hds)
102                     * ext        Will be calculated from URL if missing
103                     * format     A human-readable description of the format
104                                  ("mp4 container with h264/opus").
105                                  Calculated from the format_id, width, height.
106                                  and format_note fields if missing.
107                     * format_id  A short description of the format
108                                  ("mp4_h264_opus" or "19").
109                                 Technically optional, but strongly recommended.
110                     * format_note Additional info about the format
111                                  ("3D" or "DASH video")
112                     * width      Width of the video, if known
113                     * height     Height of the video, if known
114                     * resolution Textual description of width and height
115                     * tbr        Average bitrate of audio and video in KBit/s
116                     * abr        Average audio bitrate in KBit/s
117                     * acodec     Name of the audio codec in use
118                     * asr        Audio sampling rate in Hertz
119                     * vbr        Average video bitrate in KBit/s
120                     * fps        Frame rate
121                     * vcodec     Name of the video codec in use
122                     * container  Name of the container format
123                     * filesize   The number of bytes, if known in advance
124                     * filesize_approx  An estimate for the number of bytes
125                     * player_url SWF Player URL (used for rtmpdump).
126                     * protocol   The protocol that will be used for the actual
127                                  download, lower-case.
128                                  "http", "https", "rtsp", "rtmp", "rtmpe",
129                                  "m3u8", "m3u8_native" or "http_dash_segments".
130                     * fragment_base_url
131                                  Base URL for fragments. Each fragment's path
132                                  value (if present) will be relative to
133                                  this URL.
134                     * fragments  A list of fragments of a fragmented media.
135                                  Each fragment entry must contain either an url
136                                  or a path. If an url is present it should be
137                                  considered by a client. Otherwise both path and
138                                  fragment_base_url must be present. Here is
139                                  the list of all potential fields:
140                                  * "url" - fragment's URL
141                                  * "path" - fragment's path relative to
142                                             fragment_base_url
143                                  * "duration" (optional, int or float)
144                                  * "filesize" (optional, int)
145                     * preference Order number of this format. If this field is
146                                  present and not None, the formats get sorted
147                                  by this field, regardless of all other values.
148                                  -1 for default (order by other properties),
149                                  -2 or smaller for less than default.
150                                  < -1000 to hide the format (if there is
151                                     another one which is strictly better)
152                     * language   Language code, e.g. "de" or "en-US".
153                     * language_preference  Is this in the language mentioned in
154                                  the URL?
155                                  10 if it's what the URL is about,
156                                  -1 for default (don't know),
157                                  -10 otherwise, other values reserved for now.
158                     * quality    Order number of the video quality of this
159                                  format, irrespective of the file format.
160                                  -1 for default (order by other properties),
161                                  -2 or smaller for less than default.
162                     * source_preference  Order number for this video source
163                                   (quality takes higher priority)
164                                  -1 for default (order by other properties),
165                                  -2 or smaller for less than default.
166                     * http_headers  A dictionary of additional HTTP headers
167                                  to add to the request.
168                     * stretched_ratio  If given and not 1, indicates that the
169                                  video's pixels are not square.
170                                  width : height ratio as float.
171                     * no_resume  The server does not support resuming the
172                                  (HTTP or RTMP) download. Boolean.
173
174     url:            Final video URL.
175     ext:            Video filename extension.
176     format:         The video format, defaults to ext (used for --get-format)
177     player_url:     SWF Player URL (used for rtmpdump).
178
179     The following fields are optional:
180
181     alt_title:      A secondary title of the video.
182     display_id      An alternative identifier for the video, not necessarily
183                     unique, but available before title. Typically, id is
184                     something like "4234987", title "Dancing naked mole rats",
185                     and display_id "dancing-naked-mole-rats"
186     thumbnails:     A list of dictionaries, with the following entries:
187                         * "id" (optional, string) - Thumbnail format ID
188                         * "url"
189                         * "preference" (optional, int) - quality of the image
190                         * "width" (optional, int)
191                         * "height" (optional, int)
192                         * "resolution" (optional, string "{width}x{height}",
193                                         deprecated)
194                         * "filesize" (optional, int)
195     thumbnail:      Full URL to a video thumbnail image.
196     description:    Full video description.
197     uploader:       Full name of the video uploader.
198     license:        License name the video is licensed under.
199     creator:        The creator of the video.
200     release_date:   The date (YYYYMMDD) when the video was released.
201     timestamp:      UNIX timestamp of the moment the video became available.
202     upload_date:    Video upload date (YYYYMMDD).
203                     If not explicitly set, calculated from timestamp.
204     uploader_id:    Nickname or id of the video uploader.
205     uploader_url:   Full URL to a personal webpage of the video uploader.
206     location:       Physical location where the video was filmed.
207     subtitles:      The available subtitles as a dictionary in the format
208                     {tag: subformats}. "tag" is usually a language code, and
209                     "subformats" is a list sorted from lower to higher
210                     preference, each element is a dictionary with the "ext"
211                     entry and one of:
212                         * "data": The subtitles file contents
213                         * "url": A URL pointing to the subtitles file
214                     "ext" will be calculated from URL if missing
215     automatic_captions: Like 'subtitles', used by the YoutubeIE for
216                     automatically generated captions
217     duration:       Length of the video in seconds, as an integer or float.
218     view_count:     How many users have watched the video on the platform.
219     like_count:     Number of positive ratings of the video
220     dislike_count:  Number of negative ratings of the video
221     repost_count:   Number of reposts of the video
222     average_rating: Average rating given by users, the scale used depends on the webpage
223     comment_count:  Number of comments on the video
224     comments:       A list of comments, each with one or more of the following
225                     properties (all but one of text or html optional):
226                         * "author" - human-readable name of the comment author
227                         * "author_id" - user ID of the comment author
228                         * "id" - Comment ID
229                         * "html" - Comment as HTML
230                         * "text" - Plain text of the comment
231                         * "timestamp" - UNIX timestamp of comment
232                         * "parent" - ID of the comment this one is replying to.
233                                      Set to "root" to indicate that this is a
234                                      comment to the original video.
235     age_limit:      Age restriction for the video, as an integer (years)
236     webpage_url:    The URL to the video webpage, if given to youtube-dl it
237                     should allow to get the same result again. (It will be set
238                     by YoutubeDL if it's missing)
239     categories:     A list of categories that the video falls in, for example
240                     ["Sports", "Berlin"]
241     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
242     is_live:        True, False, or None (=unknown). Whether this video is a
243                     live stream that goes on instead of a fixed-length video.
244     start_time:     Time in seconds where the reproduction should start, as
245                     specified in the URL.
246     end_time:       Time in seconds where the reproduction should end, as
247                     specified in the URL.
248     chapters:       A list of dictionaries, with the following entries:
249                         * "start_time" - The start time of the chapter in seconds
250                         * "end_time" - The end time of the chapter in seconds
251                         * "title" (optional, string)
252
253     The following fields should only be used when the video belongs to some logical
254     chapter or section:
255
256     chapter:        Name or title of the chapter the video belongs to.
257     chapter_number: Number of the chapter the video belongs to, as an integer.
258     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
259
260     The following fields should only be used when the video is an episode of some
261     series, programme or podcast:
262
263     series:         Title of the series or programme the video episode belongs to.
264     season:         Title of the season the video episode belongs to.
265     season_number:  Number of the season the video episode belongs to, as an integer.
266     season_id:      Id of the season the video episode belongs to, as a unicode string.
267     episode:        Title of the video episode. Unlike mandatory video title field,
268                     this field should denote the exact title of the video episode
269                     without any kind of decoration.
270     episode_number: Number of the video episode within a season, as an integer.
271     episode_id:     Id of the video episode, as a unicode string.
272
273     The following fields should only be used when the media is a track or a part of
274     a music album:
275
276     track:          Title of the track.
277     track_number:   Number of the track within an album or a disc, as an integer.
278     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
279                     as a unicode string.
280     artist:         Artist(s) of the track.
281     genre:          Genre(s) of the track.
282     album:          Title of the album the track belongs to.
283     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
284     album_artist:   List of all artists appeared on the album (e.g.
285                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
286                     and compilations).
287     disc_number:    Number of the disc or other physical medium the track belongs to,
288                     as an integer.
289     release_year:   Year (YYYY) when the album was released.
290
291     Unless mentioned otherwise, the fields should be Unicode strings.
292
293     Unless mentioned otherwise, None is equivalent to absence of information.
294
295
296     _type "playlist" indicates multiple videos.
297     There must be a key "entries", which is a list, an iterable, or a PagedList
298     object, each element of which is a valid dictionary by this specification.
299
300     Additionally, playlists can have "title", "description" and "id" attributes
301     with the same semantics as videos (see above).
302
303
304     _type "multi_video" indicates that there are multiple videos that
305     form a single show, for example multiple acts of an opera or TV episode.
306     It must have an entries key like a playlist and contain all the keys
307     required for a video at the same time.
308
309
310     _type "url" indicates that the video must be extracted from another
311     location, possibly by a different extractor. Its only required key is:
312     "url" - the next URL to extract.
313     The key "ie_key" can be set to the class name (minus the trailing "IE",
314     e.g. "Youtube") if the extractor class is known in advance.
315     Additionally, the dictionary may have any properties of the resolved entity
316     known in advance, for example "title" if the title of the referred video is
317     known ahead of time.
318
319
320     _type "url_transparent" entities have the same specification as "url", but
321     indicate that the given additional information is more precise than the one
322     associated with the resolved URL.
323     This is useful when a site employs a video service that hosts the video and
324     its technical metadata, but that video service does not embed a useful
325     title, description etc.
326
327
328     Subclasses of this one should re-define the _real_initialize() and
329     _real_extract() methods and define a _VALID_URL regexp.
330     Probably, they should also be added to the list of extractors.
331
332     _GEO_BYPASS attribute may be set to False in order to disable
333     geo restriction bypass mechanisms for a particular extractor.
334     Though it won't disable explicit geo restriction bypass based on
335     country code provided with geo_bypass_country. (experimental)
336
337     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
338     countries for this extractor. One of these countries will be used by
339     geo restriction bypass mechanism right away in order to bypass
340     geo restriction, of course, if the mechanism is not disabled. (experimental)
341
342     NB: both these geo attributes are experimental and may change in future
343     or be completely removed.
344
345     Finally, the _WORKING attribute should be set to False for broken IEs
346     in order to warn the users and skip the tests.
347     """
348
    # Whether _real_initialize() has already run (also set per instance in __init__).
    _ready = False
    # The YoutubeDL instance used for option lookup and network access; set via set_downloader().
    _downloader = None
    # Fake source IP used for geo restriction bypass, or None when unset/disabled.
    _x_forwarded_for_ip = None
    # Set to False in a subclass to disable geo bypass mechanisms (see class docstring).
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted country codes (see class docstring).
    _GEO_COUNTRIES = None
    # Set to False for broken extractors, to warn users and skip tests.
    _WORKING = True
355
356     def __init__(self, downloader=None):
357         """Constructor. Receives an optional downloader."""
358         self._ready = False
359         self._x_forwarded_for_ip = None
360         self.set_downloader(downloader)
361
362     @classmethod
363     def suitable(cls, url):
364         """Receives a URL and returns True if suitable for this IE."""
365
366         # This does not use has/getattr intentionally - we want to know whether
367         # we have cached the regexp for *this* class, whereas getattr would also
368         # match the superclass
369         if '_VALID_URL_RE' not in cls.__dict__:
370             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
371         return cls._VALID_URL_RE.match(url) is not None
372
373     @classmethod
374     def _match_id(cls, url):
375         if '_VALID_URL_RE' not in cls.__dict__:
376             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
377         m = cls._VALID_URL_RE.match(url)
378         assert m
379         return m.group('id')
380
    @classmethod
    def working(cls):
        """Getter method for _WORKING (False for known-broken extractors)."""
        return cls._WORKING
385
386     def initialize(self):
387         """Initializes an instance (authentication, etc)."""
388         self._initialize_geo_bypass(self._GEO_COUNTRIES)
389         if not self._ready:
390             self._real_initialize()
391             self._ready = True
392
393     def _initialize_geo_bypass(self, countries):
394         """
395         Initialize geo restriction bypass mechanism.
396
397         This method is used to initialize geo bypass mechanism based on faking
398         X-Forwarded-For HTTP header. A random country from provided country list
399         is selected and a random IP belonging to this country is generated. This
400         IP will be passed as X-Forwarded-For HTTP header in all subsequent
401         HTTP requests.
402
403         This method will be used for initial geo bypass mechanism initialization
404         during the instance initialization with _GEO_COUNTRIES.
405
406         You may also manually call it from extractor's code if geo countries
407         information is not available beforehand (e.g. obtained during
408         extraction) or due to some another reason.
409         """
410         if not self._x_forwarded_for_ip:
411             country_code = self._downloader.params.get('geo_bypass_country', None)
412             # If there is no explicit country for geo bypass specified and
413             # the extractor is known to be geo restricted let's fake IP
414             # as X-Forwarded-For right away.
415             if (not country_code and
416                     self._GEO_BYPASS and
417                     self._downloader.params.get('geo_bypass', True) and
418                     countries):
419                 country_code = random.choice(countries)
420             if country_code:
421                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
422                 if self._downloader.params.get('verbose', False):
423                     self._downloader.to_stdout(
424                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
425                         % (self._x_forwarded_for_ip, country_code.upper()))
426
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most one retry: a GeoRestrictedError may trigger a second
            # attempt with a freshly faked X-Forwarded-For IP.
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        # Record the fake IP in the result so downstream
                        # requests can reuse it.
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            # Already a presentable extractor error - re-raise unchanged.
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
447
448     def __maybe_fake_ip_and_retry(self, countries):
449         if (not self._downloader.params.get('geo_bypass_country', None) and
450                 self._GEO_BYPASS and
451                 self._downloader.params.get('geo_bypass', True) and
452                 not self._x_forwarded_for_ip and
453                 countries):
454             country_code = random.choice(countries)
455             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
456             if self._x_forwarded_for_ip:
457                 self.report_warning(
458                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
459                     % (self._x_forwarded_for_ip, country_code.upper()))
460                 return True
461         return False
462
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader provides option lookup (.params) and network access
        # (.urlopen) to the extractor; may be None until one is attached.
        self._downloader = downloader
466
    def _real_initialize(self):
        """Real initialization process (e.g. authentication). Redefine in
        subclasses. Called at most once per instance via initialize()."""
        pass
470
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Subclasses return an info dict (or playlist/url result) as described
        in the class docstring.
        """
        pass
474
475     @classmethod
476     def ie_key(cls):
477         """A string for getting the InfoExtractor with get_info_extractor"""
478         return compat_str(cls.__name__[:-2])
479
480     @property
481     def IE_NAME(self):
482         return compat_str(type(self).__name__[:-2])
483
484     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
485         """ Returns the response handle """
486         if note is None:
487             self.report_download_webpage(video_id)
488         elif note is not False:
489             if video_id is None:
490                 self.to_screen('%s' % (note,))
491             else:
492                 self.to_screen('%s: %s' % (video_id, note))
493         if isinstance(url_or_request, compat_urllib_request.Request):
494             url_or_request = update_Request(
495                 url_or_request, data=data, headers=headers, query=query)
496         else:
497             if query:
498                 url_or_request = update_url_query(url_or_request, query)
499             if data is not None or headers:
500                 url_or_request = sanitized_Request(url_or_request, data, headers)
501         try:
502             return self._downloader.urlopen(url_or_request)
503         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
504             if errnote is False:
505                 return False
506             if errnote is None:
507                 errnote = 'Unable to download webpage'
508
509             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
510             if fatal:
511                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
512             else:
513                 self._downloader.report_warning(errmsg)
514                 return False
515
516     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
517         """ Returns a tuple (page content as string, URL handle) """
518         # Strip hashes from the URL (#1038)
519         if isinstance(url_or_request, (compat_str, str)):
520             url_or_request = url_or_request.partition('#')[0]
521
522         # Some sites check X-Forwarded-For HTTP header in order to figure out
523         # the origin of the client behind proxy. This allows bypassing geo
524         # restriction by faking this header's value to IP that belongs to some
525         # geo unrestricted country. We will do so once we encounter any
526         # geo restriction error.
527         if self._x_forwarded_for_ip:
528             if 'X-Forwarded-For' not in headers:
529                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
530
531         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
532         if urlh is False:
533             assert not fatal
534             return False
535         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
536         return (content, urlh)
537
538     @staticmethod
539     def _guess_encoding_from_content(content_type, webpage_bytes):
540         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
541         if m:
542             encoding = m.group(1)
543         else:
544             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
545                           webpage_bytes[:1024])
546             if m:
547                 encoding = m.group(1).decode('ascii')
548             elif webpage_bytes.startswith(b'\xff\xfe'):
549                 encoding = 'utf-16'
550             else:
551                 encoding = 'utf-8'
552
553         return encoding
554
555     def __check_blocked(self, content):
556         first_block = content[:512]
557         if ('<title>Access to this site is blocked</title>' in content and
558                 'Websense' in first_block):
559             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
560             blocked_iframe = self._html_search_regex(
561                 r'<iframe src="([^"]+)"', content,
562                 'Websense information URL', default=None)
563             if blocked_iframe:
564                 msg += ' Visit %s for more details' % blocked_iframe
565             raise ExtractorError(msg, expected=True)
566         if '<title>The URL you requested has been blocked</title>' in first_block:
567             msg = (
568                 'Access to this webpage has been blocked by Indian censorship. '
569                 'Use a VPN or proxy server (with --proxy) to route around it.')
570             block_msg = self._html_search_regex(
571                 r'</h1><p>(.*?)</p>',
572                 content, 'block message', default=None)
573             if block_msg:
574                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
575             raise ExtractorError(msg, expected=True)
576         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
577                 'blocklist.rkn.gov.ru' in content):
578             raise ExtractorError(
579                 'Access to this webpage has been blocked by decision of the Russian government. '
580                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
581                 expected=True)
582
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the response body from urlh and decode it to a string.

        Honors the dump_intermediate_pages and write_pages options, and
        checks the decoded content for known censorship block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            # Caller-supplied bytes to prepend (e.g. previously read data).
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                # url_or_request may be a Request object or a plain URL string.
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep filenames within typical filesystem limits; replace
                # the tail with an md5 hash so names stay unique.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown/invalid encoding name - fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
627
628     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
629         """ Returns the data of the page as a string """
630         success = False
631         try_count = 0
632         while success is False:
633             try:
634                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
635                 success = True
636             except compat_http_client.IncompleteRead as e:
637                 try_count += 1
638                 if try_count >= tries:
639                     raise e
640                 self._sleep(timeout, video_id)
641         if res is False:
642             return res
643         else:
644             content, _ = res
645             return content
646
647     def _download_xml(self, url_or_request, video_id,
648                       note='Downloading XML', errnote='Unable to download XML',
649                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
650         """Return the xml as an xml.etree.ElementTree.Element"""
651         xml_string = self._download_webpage(
652             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
653         if xml_string is False:
654             return xml_string
655         if transform_source:
656             xml_string = transform_source(xml_string)
657         return compat_etree_fromstring(xml_string.encode('utf-8'))
658
659     def _download_json(self, url_or_request, video_id,
660                        note='Downloading JSON metadata',
661                        errnote='Unable to download JSON metadata',
662                        transform_source=None,
663                        fatal=True, encoding=None, data=None, headers={}, query={}):
664         json_string = self._download_webpage(
665             url_or_request, video_id, note, errnote, fatal=fatal,
666             encoding=encoding, data=data, headers=headers, query=query)
667         if (not fatal) and json_string is False:
668             return None
669         return self._parse_json(
670             json_string, video_id, transform_source=transform_source, fatal=fatal)
671
672     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
673         if transform_source:
674             json_string = transform_source(json_string)
675         try:
676             return json.loads(json_string)
677         except ValueError as ve:
678             errmsg = '%s: Failed to parse JSON ' % video_id
679             if fatal:
680                 raise ExtractorError(errmsg, cause=ve)
681             else:
682                 self.report_warning(errmsg + str(ve))
683
684     def report_warning(self, msg, video_id=None):
685         idstr = '' if video_id is None else '%s: ' % video_id
686         self._downloader.report_warning(
687             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
688
689     def to_screen(self, msg):
690         """Print msg to screen, prefixing it with '[ie_name]'"""
691         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
692
693     def report_extraction(self, id_or_name):
694         """Report information extraction."""
695         self.to_screen('%s: Extracting information' % id_or_name)
696
697     def report_download_webpage(self, video_id):
698         """Report webpage download."""
699         self.to_screen('%s: Downloading webpage' % video_id)
700
701     def report_age_confirmation(self):
702         """Report attempt to confirm age."""
703         self.to_screen('Confirming age')
704
705     def report_login(self):
706         """Report attempt to log in."""
707         self.to_screen('Logging in')
708
709     @staticmethod
710     def raise_login_required(msg='This video is only available for registered users'):
711         raise ExtractorError(
712             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
713             expected=True)
714
715     @staticmethod
716     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
717         raise GeoRestrictedError(msg, countries=countries)
718
719     # Methods for following #608
720     @staticmethod
721     def url_result(url, ie=None, video_id=None, video_title=None):
722         """Returns a URL that points to a page that should be processed"""
723         # TODO: ie should be the class used for getting the info
724         video_info = {'_type': 'url',
725                       'url': url,
726                       'ie_key': ie}
727         if video_id is not None:
728             video_info['id'] = video_id
729         if video_title is not None:
730             video_info['title'] = video_title
731         return video_info
732
733     def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
734         urlrs = orderedSet(
735             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
736             for m in matches)
737         return self.playlist_result(
738             urlrs, playlist_id=video_id, playlist_title=video_title)
739
740     @staticmethod
741     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
742         """Returns a playlist"""
743         video_info = {'_type': 'playlist',
744                       'entries': entries}
745         if playlist_id:
746             video_info['id'] = playlist_id
747         if playlist_title:
748             video_info['title'] = playlist_title
749         if playlist_description:
750             video_info['description'] = playlist_description
751         return video_info
752
753     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
754         """
755         Perform a regex search on the given string, using a single or a list of
756         patterns returning the first matching group.
757         In case of failure return a default value or raise a WARNING or a
758         RegexNotFoundError, depending on fatal, specifying the field name.
759         """
760         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
761             mobj = re.search(pattern, string, flags)
762         else:
763             for p in pattern:
764                 mobj = re.search(p, string, flags)
765                 if mobj:
766                     break
767
768         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
769             _name = '\033[0;34m%s\033[0m' % name
770         else:
771             _name = name
772
773         if mobj:
774             if group is None:
775                 # return the first matching group
776                 return next(g for g in mobj.groups() if g is not None)
777             else:
778                 return mobj.group(group)
779         elif default is not NO_DEFAULT:
780             return default
781         elif fatal:
782             raise RegexNotFoundError('Unable to extract %s' % _name)
783         else:
784             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
785             return None
786
787     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
788         """
789         Like _search_regex, but strips HTML tags and unescapes entities.
790         """
791         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
792         if res:
793             return clean_html(res).strip()
794         else:
795             return res
796
797     def _get_netrc_login_info(self, netrc_machine=None):
798         username = None
799         password = None
800         netrc_machine = netrc_machine or self._NETRC_MACHINE
801
802         if self._downloader.params.get('usenetrc', False):
803             try:
804                 info = netrc.netrc().authenticators(netrc_machine)
805                 if info is not None:
806                     username = info[0]
807                     password = info[2]
808                 else:
809                     raise netrc.NetrcParseError(
810                         'No authenticators for %s' % netrc_machine)
811             except (IOError, netrc.NetrcParseError) as err:
812                 self._downloader.report_warning(
813                     'parsing .netrc: %s' % error_to_compat_str(err))
814
815         return username, password
816
817     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
818         """
819         Get the login info as (username, password)
820         First look for the manually specified credentials using username_option
821         and password_option as keys in params dictionary. If no such credentials
822         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
823         value.
824         If there's no info available, return (None, None)
825         """
826         if self._downloader is None:
827             return (None, None)
828
829         downloader_params = self._downloader.params
830
831         # Attempt to use provided username and password or .netrc data
832         if downloader_params.get(username_option) is not None:
833             username = downloader_params[username_option]
834             password = downloader_params[password_option]
835         else:
836             username, password = self._get_netrc_login_info(netrc_machine)
837
838         return username, password
839
840     def _get_tfa_info(self, note='two-factor verification code'):
841         """
842         Get the two-factor authentication info
843         TODO - asking the user will be required for sms/phone verify
844         currently just uses the command line option
845         If there's no info available, return None
846         """
847         if self._downloader is None:
848             return None
849         downloader_params = self._downloader.params
850
851         if downloader_params.get('twofactor') is not None:
852             return downloader_params['twofactor']
853
854         return compat_getpass('Type %s and press [Return]: ' % note)
855
856     # Helper functions for extracting OpenGraph info
857     @staticmethod
858     def _og_regexes(prop):
859         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
860         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
861                        % {'prop': re.escape(prop)})
862         template = r'<meta[^>]+?%s[^>]+?%s'
863         return [
864             template % (property_re, content_re),
865             template % (content_re, property_re),
866         ]
867
868     @staticmethod
869     def _meta_regex(prop):
870         return r'''(?isx)<meta
871                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
872                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
873
874     def _og_search_property(self, prop, html, name=None, **kargs):
875         if not isinstance(prop, (list, tuple)):
876             prop = [prop]
877         if name is None:
878             name = 'OpenGraph %s' % prop[0]
879         og_regexes = []
880         for p in prop:
881             og_regexes.extend(self._og_regexes(p))
882         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
883         if escaped is None:
884             return None
885         return unescapeHTML(escaped)
886
887     def _og_search_thumbnail(self, html, **kargs):
888         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
889
890     def _og_search_description(self, html, **kargs):
891         return self._og_search_property('description', html, fatal=False, **kargs)
892
893     def _og_search_title(self, html, **kargs):
894         return self._og_search_property('title', html, **kargs)
895
896     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
897         regexes = self._og_regexes('video') + self._og_regexes('video:url')
898         if secure:
899             regexes = self._og_regexes('video:secure_url') + regexes
900         return self._html_search_regex(regexes, html, name, **kargs)
901
902     def _og_search_url(self, html, **kargs):
903         return self._og_search_property('url', html, **kargs)
904
905     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
906         if not isinstance(name, (list, tuple)):
907             name = [name]
908         if display_name is None:
909             display_name = name[0]
910         return self._html_search_regex(
911             [self._meta_regex(n) for n in name],
912             html, display_name, fatal=fatal, group='content', **kwargs)
913
914     def _dc_search_uploader(self, html):
915         return self._html_search_meta('dc.creator', html, 'uploader')
916
917     def _rta_search(self, html):
918         # See http://www.rtalabel.org/index.php?content=howtofaq#single
919         if re.search(r'(?ix)<meta\s+name="rating"\s+'
920                      r'     content="RTA-5042-1996-1400-1577-RTA"',
921                      html):
922             return 18
923         return 0
924
925     def _media_rating_search(self, html):
926         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
927         rating = self._html_search_meta('rating', html)
928
929         if not rating:
930             return None
931
932         RATING_TABLE = {
933             'safe for kids': 0,
934             'general': 8,
935             '14 years': 14,
936             'mature': 17,
937             'restricted': 19,
938         }
939         return RATING_TABLE.get(rating.lower())
940
941     def _family_friendly_search(self, html):
942         # See http://schema.org/VideoObject
943         family_friendly = self._html_search_meta('isFamilyFriendly', html)
944
945         if not family_friendly:
946             return None
947
948         RATING_TABLE = {
949             '1': 0,
950             'true': 0,
951             '0': 18,
952             'false': 18,
953         }
954         return RATING_TABLE.get(family_friendly.lower())
955
956     def _twitter_search_player(self, html):
957         return self._html_search_meta('twitter:player', html,
958                                       'twitter card player')
959
960     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
961         json_ld = self._search_regex(
962             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
963             html, 'JSON-LD', group='json_ld', **kwargs)
964         default = kwargs.get('default', NO_DEFAULT)
965         if not json_ld:
966             return default if default is not NO_DEFAULT else {}
967         # JSON-LD may be malformed and thus `fatal` should be respected.
968         # At the same time `default` may be passed that assumes `fatal=False`
969         # for _search_regex. Let's simulate the same behavior here as well.
970         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
971         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
972
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Extract an info dict from JSON-LD metadata.

        json_ld may be a raw JSON string, a dict or a list of dicts; only the
        first entry with an 'http://schema.org' @context is considered.
        Returns an info dict with all None-valued keys removed.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize single object to a list so both shapes are handled alike
            json_ld = [json_ld]

        def extract_video_object(e):
            # Map schema.org VideoObject properties onto youtube-dl info fields
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })

        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                # A mismatched type aborts with whatever has been collected so far
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                elif item_type == 'WebPage':
                    # A WebPage may embed the VideoObject under its 'video' key
                    video = e.get('video')
                    if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                        extract_video_object(video)
                # Only the first schema.org entry is processed
                break
        return dict((k, v) for k, v in info.items() if v is not None)
1031
1032     @staticmethod
1033     def _hidden_inputs(html):
1034         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1035         hidden_inputs = {}
1036         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1037             attrs = extract_attributes(input)
1038             if not input:
1039                 continue
1040             if attrs.get('type') not in ('hidden', 'submit'):
1041                 continue
1042             name = attrs.get('name') or attrs.get('id')
1043             value = attrs.get('value')
1044             if name and value is not None:
1045                 hidden_inputs[name] = value
1046         return hidden_inputs
1047
1048     def _form_hidden_inputs(self, form_id, html):
1049         form = self._search_regex(
1050             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1051             html, '%s form' % form_id, group='form')
1052         return self._hidden_inputs(form)
1053
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        When field_preference (a list/tuple of field names) is given, sort
        only by those fields; otherwise use the built-in multi-key ordering
        below. Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-supplied ordering: missing values sort lowest
                # ('' for format_id since it compares against strings)
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            # Plain HTTP(S) beats other protocols; rtsp is penalized hardest
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple comparison: earlier fields dominate later ones
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1129
1130     def _check_formats(self, formats, video_id):
1131         if formats:
1132             formats[:] = filter(
1133                 lambda f: self._is_valid_url(
1134                     f['url'], video_id,
1135                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1136                 formats)
1137
1138     @staticmethod
1139     def _remove_duplicate_formats(formats):
1140         format_urls = set()
1141         unique_formats = []
1142         for f in formats:
1143             if f['url'] not in format_urls:
1144                 format_urls.add(f['url'])
1145                 unique_formats.append(f)
1146         formats[:] = unique_formats
1147
1148     def _is_valid_url(self, url, video_id, item='video', headers={}):
1149         url = self._proto_relative_url(url, scheme='http:')
1150         # For now assume non HTTP(S) URLs always valid
1151         if not (url.startswith('http://') or url.startswith('https://')):
1152             return True
1153         try:
1154             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1155             return True
1156         except ExtractorError as e:
1157             if isinstance(e.cause, compat_urllib_error.URLError):
1158                 self.to_screen(
1159                     '%s: %s URL is invalid, skipping' % (video_id, item))
1160                 return False
1161             raise
1162
1163     def http_scheme(self):
1164         """ Either "http:" or "https:", depending on the user's preferences """
1165         return (
1166             'http:'
1167             if self._downloader.params.get('prefer_insecure', False)
1168             else 'https:')
1169
1170     def _proto_relative_url(self, url, scheme=None):
1171         if url is None:
1172             return url
1173         if url.startswith('//'):
1174             if scheme is None:
1175                 scheme = self.http_scheme()
1176             return scheme + url
1177         else:
1178             return url
1179
1180     def _sleep(self, timeout, video_id, msg_template=None):
1181         if msg_template is None:
1182             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1183         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1184         self.to_screen(msg)
1185         time.sleep(timeout)
1186
1187     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1188                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1189                              fatal=True, m3u8_id=None):
1190         manifest = self._download_xml(
1191             manifest_url, video_id, 'Downloading f4m manifest',
1192             'Unable to download f4m manifest',
1193             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1194             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1195             transform_source=transform_source,
1196             fatal=fatal)
1197
1198         if manifest is False:
1199             return []
1200
1201         return self._parse_f4m_formats(
1202             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1203             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1204
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m manifest Element into a list of format dicts.

        Handles both f4m 1.0 and 2.0 namespaces, skips Akamai
        playerVerificationChallenge-protected and DRM-protected media, and
        recurses into referenced f4m/m3u8 sub-manifests.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media nodes exist
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        # An audio/* mime type means there is no video stream at all
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # Stream-level manifests (with bootstrapInfo) are flv downloads
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1304
1305     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1306         return {
1307             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1308             'url': m3u8_url,
1309             'ext': ext,
1310             'protocol': 'm3u8',
1311             'preference': preference - 100 if preference else -100,
1312             'resolution': 'multiple',
1313             'format_note': 'Quality selection URL',
1314         }
1315
1316     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1317                               entry_protocol='m3u8', preference=None,
1318                               m3u8_id=None, note=None, errnote=None,
1319                               fatal=True, live=False):
1320         res = self._download_webpage_handle(
1321             m3u8_url, video_id,
1322             note=note or 'Downloading m3u8 information',
1323             errnote=errnote or 'Failed to download m3u8 information',
1324             fatal=fatal)
1325
1326         if res is False:
1327             return []
1328
1329         m3u8_doc, urlh = res
1330         m3u8_url = urlh.geturl()
1331
1332         return self._parse_m3u8_formats(
1333             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1334             preference=preference, m3u8_id=m3u8_id, live=live)
1335
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS (m3u8) playlist document into a list of format dicts.

        m3u8_doc is the playlist text; m3u8_url is its (final) URL, used to
        resolve relative entry URLs.  A media playlist yields exactly one
        format; a master playlist yields one format per variant stream plus
        one per audio/video rendition that carries its own URI.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        formats = []

        # Resolve a (possibly relative) playlist entry against the manifest URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts (renditions)
        groups = {}
        # Attributes of the most recently seen EXT-X-STREAM-INF tag; as per
        # the spec they apply to the URI line that follows the tag
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Register an EXT-X-MEDIA rendition and, if it has its own URI,
            # emit it as a standalone format.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Non-tag, non-blank line: the variant URI described by the
                # preceding EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing audio group an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1491
1492     @staticmethod
1493     def _xpath_ns(path, namespace=None):
1494         if not namespace:
1495             return path
1496         out = []
1497         for c in path.split('/'):
1498             if not c or c == '.':
1499                 out.append(c)
1500             else:
1501                 out.append('{%s}%s' % (namespace, c))
1502         return '/'.join(out)
1503
1504     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1505         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1506
1507         if smil is False:
1508             assert not fatal
1509             return []
1510
1511         namespace = self._parse_smil_namespace(smil)
1512
1513         return self._parse_smil_formats(
1514             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1515
1516     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1517         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1518         if smil is False:
1519             return {}
1520         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1521
1522     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1523         return self._download_xml(
1524             smil_url, video_id, 'Downloading SMIL file',
1525             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1526
1527     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1528         namespace = self._parse_smil_namespace(smil)
1529
1530         formats = self._parse_smil_formats(
1531             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1532         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1533
1534         video_id = os.path.splitext(url_basename(smil_url))[0]
1535         title = None
1536         description = None
1537         upload_date = None
1538         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1539             name = meta.attrib.get('name')
1540             content = meta.attrib.get('content')
1541             if not name or not content:
1542                 continue
1543             if not title and name == 'title':
1544                 title = content
1545             elif not description and name in ('description', 'abstract'):
1546                 description = content
1547             elif not upload_date and name == 'date':
1548                 upload_date = unified_strdate(content)
1549
1550         thumbnails = [{
1551             'id': image.get('type'),
1552             'url': image.get('src'),
1553             'width': int_or_none(image.get('width')),
1554             'height': int_or_none(image.get('height')),
1555         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1556
1557         return {
1558             'id': video_id,
1559             'title': title or video_id,
1560             'description': description,
1561             'upload_date': upload_date,
1562             'thumbnails': thumbnails,
1563             'formats': formats,
1564             'subtitles': subtitles,
1565         }
1566
1567     def _parse_smil_namespace(self, smil):
1568         return self._search_regex(
1569             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1570
1571     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1572         base = smil_url
1573         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1574             b = meta.get('base') or meta.get('httpBase')
1575             if b:
1576                 base = b
1577                 break
1578
1579         formats = []
1580         rtmp_count = 0
1581         http_count = 0
1582         m3u8_count = 0
1583
1584         srcs = []
1585         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1586         for medium in media:
1587             src = medium.get('src')
1588             if not src or src in srcs:
1589                 continue
1590             srcs.append(src)
1591
1592             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1593             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1594             width = int_or_none(medium.get('width'))
1595             height = int_or_none(medium.get('height'))
1596             proto = medium.get('proto')
1597             ext = medium.get('ext')
1598             src_ext = determine_ext(src)
1599             streamer = medium.get('streamer') or base
1600
1601             if proto == 'rtmp' or streamer.startswith('rtmp'):
1602                 rtmp_count += 1
1603                 formats.append({
1604                     'url': streamer,
1605                     'play_path': src,
1606                     'ext': 'flv',
1607                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1608                     'tbr': bitrate,
1609                     'filesize': filesize,
1610                     'width': width,
1611                     'height': height,
1612                 })
1613                 if transform_rtmp_url:
1614                     streamer, src = transform_rtmp_url(streamer, src)
1615                     formats[-1].update({
1616                         'url': streamer,
1617                         'play_path': src,
1618                     })
1619                 continue
1620
1621             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1622             src_url = src_url.strip()
1623
1624             if proto == 'm3u8' or src_ext == 'm3u8':
1625                 m3u8_formats = self._extract_m3u8_formats(
1626                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1627                 if len(m3u8_formats) == 1:
1628                     m3u8_count += 1
1629                     m3u8_formats[0].update({
1630                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1631                         'tbr': bitrate,
1632                         'width': width,
1633                         'height': height,
1634                     })
1635                 formats.extend(m3u8_formats)
1636                 continue
1637
1638             if src_ext == 'f4m':
1639                 f4m_url = src_url
1640                 if not f4m_params:
1641                     f4m_params = {
1642                         'hdcore': '3.2.0',
1643                         'plugin': 'flowplayer-3.2.0.1',
1644                     }
1645                 f4m_url += '&' if '?' in f4m_url else '?'
1646                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1647                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1648                 continue
1649
1650             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1651                 http_count += 1
1652                 formats.append({
1653                     'url': src_url,
1654                     'ext': ext or src_ext or 'flv',
1655                     'format_id': 'http-%d' % (bitrate or http_count),
1656                     'tbr': bitrate,
1657                     'filesize': filesize,
1658                     'width': width,
1659                     'height': height,
1660                 })
1661                 continue
1662
1663         return formats
1664
1665     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1666         urls = []
1667         subtitles = {}
1668         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1669             src = textstream.get('src')
1670             if not src or src in urls:
1671                 continue
1672             urls.append(src)
1673             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1674             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1675             subtitles.setdefault(lang, []).append({
1676                 'url': src,
1677                 'ext': ext,
1678             })
1679         return subtitles
1680
1681     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1682         xspf = self._download_xml(
1683             playlist_url, playlist_id, 'Downloading xpsf playlist',
1684             'Unable to download xspf manifest', fatal=fatal)
1685         if xspf is False:
1686             return []
1687         return self._parse_xspf(xspf, playlist_id)
1688
1689     def _parse_xspf(self, playlist, playlist_id):
1690         NS_MAP = {
1691             'xspf': 'http://xspf.org/ns/0/',
1692             's1': 'http://static.streamone.nl/player/ns/0',
1693         }
1694
1695         entries = []
1696         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1697             title = xpath_text(
1698                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1699             description = xpath_text(
1700                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1701             thumbnail = xpath_text(
1702                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1703             duration = float_or_none(
1704                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1705
1706             formats = [{
1707                 'url': location.text,
1708                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1709                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1710                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1711             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1712             self._sort_formats(formats)
1713
1714             entries.append({
1715                 'id': playlist_id,
1716                 'title': title,
1717                 'description': description,
1718                 'thumbnail': thumbnail,
1719                 'duration': duration,
1720                 'formats': formats,
1721             })
1722         return entries
1723
1724     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1725         res = self._download_webpage_handle(
1726             mpd_url, video_id,
1727             note=note or 'Downloading MPD manifest',
1728             errnote=errnote or 'Failed to download MPD manifest',
1729             fatal=fatal)
1730         if res is False:
1731             return []
1732         mpd, urlh = res
1733         mpd_base_url = base_url(urlh.geturl())
1734
1735         return self._parse_mpd_formats(
1736             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1737             formats_dict=formats_dict, mpd_url=mpd_url)
1738
1739     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1740         """
1741         Parse formats from MPD manifest.
1742         References:
1743          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1744             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1745          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1746         """
1747         if mpd_doc.get('type') == 'dynamic':
1748             return []
1749
1750         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1751
1752         def _add_ns(path):
1753             return self._xpath_ns(path, namespace)
1754
1755         def is_drm_protected(element):
1756             return element.find(_add_ns('ContentProtection')) is not None
1757
1758         def extract_multisegment_info(element, ms_parent_info):
1759             ms_info = ms_parent_info.copy()
1760
1761             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1762             # common attributes and elements.  We will only extract relevant
1763             # for us.
1764             def extract_common(source):
1765                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1766                 if segment_timeline is not None:
1767                     s_e = segment_timeline.findall(_add_ns('S'))
1768                     if s_e:
1769                         ms_info['total_number'] = 0
1770                         ms_info['s'] = []
1771                         for s in s_e:
1772                             r = int(s.get('r', 0))
1773                             ms_info['total_number'] += 1 + r
1774                             ms_info['s'].append({
1775                                 't': int(s.get('t', 0)),
1776                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1777                                 'd': int(s.attrib['d']),
1778                                 'r': r,
1779                             })
1780                 start_number = source.get('startNumber')
1781                 if start_number:
1782                     ms_info['start_number'] = int(start_number)
1783                 timescale = source.get('timescale')
1784                 if timescale:
1785                     ms_info['timescale'] = int(timescale)
1786                 segment_duration = source.get('duration')
1787                 if segment_duration:
1788                     ms_info['segment_duration'] = int(segment_duration)
1789
1790             def extract_Initialization(source):
1791                 initialization = source.find(_add_ns('Initialization'))
1792                 if initialization is not None:
1793                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1794
1795             segment_list = element.find(_add_ns('SegmentList'))
1796             if segment_list is not None:
1797                 extract_common(segment_list)
1798                 extract_Initialization(segment_list)
1799                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1800                 if segment_urls_e:
1801                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1802             else:
1803                 segment_template = element.find(_add_ns('SegmentTemplate'))
1804                 if segment_template is not None:
1805                     extract_common(segment_template)
1806                     media = segment_template.get('media')
1807                     if media:
1808                         ms_info['media'] = media
1809                     initialization = segment_template.get('initialization')
1810                     if initialization:
1811                         ms_info['initialization'] = initialization
1812                     else:
1813                         extract_Initialization(segment_template)
1814             return ms_info
1815
1816         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1817         formats = []
1818         for period in mpd_doc.findall(_add_ns('Period')):
1819             period_duration = parse_duration(period.get('duration')) or mpd_duration
1820             period_ms_info = extract_multisegment_info(period, {
1821                 'start_number': 1,
1822                 'timescale': 1,
1823             })
1824             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1825                 if is_drm_protected(adaptation_set):
1826                     continue
1827                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1828                 for representation in adaptation_set.findall(_add_ns('Representation')):
1829                     if is_drm_protected(representation):
1830                         continue
1831                     representation_attrib = adaptation_set.attrib.copy()
1832                     representation_attrib.update(representation.attrib)
1833                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1834                     mime_type = representation_attrib['mimeType']
1835                     content_type = mime_type.split('/')[0]
1836                     if content_type == 'text':
1837                         # TODO implement WebVTT downloading
1838                         pass
1839                     elif content_type in ('video', 'audio'):
1840                         base_url = ''
1841                         for element in (representation, adaptation_set, period, mpd_doc):
1842                             base_url_e = element.find(_add_ns('BaseURL'))
1843                             if base_url_e is not None:
1844                                 base_url = base_url_e.text + base_url
1845                                 if re.match(r'^https?://', base_url):
1846                                     break
1847                         if mpd_base_url and not re.match(r'^https?://', base_url):
1848                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1849                                 mpd_base_url += '/'
1850                             base_url = mpd_base_url + base_url
1851                         representation_id = representation_attrib.get('id')
1852                         lang = representation_attrib.get('lang')
1853                         url_el = representation.find(_add_ns('BaseURL'))
1854                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1855                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1856                         f = {
1857                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1858                             'url': base_url,
1859                             'manifest_url': mpd_url,
1860                             'ext': mimetype2ext(mime_type),
1861                             'width': int_or_none(representation_attrib.get('width')),
1862                             'height': int_or_none(representation_attrib.get('height')),
1863                             'tbr': float_or_none(bandwidth, 1000),
1864                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1865                             'fps': int_or_none(representation_attrib.get('frameRate')),
1866                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1867                             'format_note': 'DASH %s' % content_type,
1868                             'filesize': filesize,
1869                         }
1870                         f.update(parse_codecs(representation_attrib.get('codecs')))
1871                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1872
1873                         def prepare_template(template_name, identifiers):
1874                             t = representation_ms_info[template_name]
1875                             t = t.replace('$RepresentationID$', representation_id)
1876                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1877                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1878                             t.replace('$$', '$')
1879                             return t
1880
1881                         # @initialization is a regular template like @media one
1882                         # so it should be handled just the same way (see
1883                         # https://github.com/rg3/youtube-dl/issues/11605)
1884                         if 'initialization' in representation_ms_info:
1885                             initialization_template = prepare_template(
1886                                 'initialization',
1887                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1888                                 # $Time$ shall not be included for @initialization thus
1889                                 # only $Bandwidth$ remains
1890                                 ('Bandwidth', ))
1891                             representation_ms_info['initialization_url'] = initialization_template % {
1892                                 'Bandwidth': bandwidth,
1893                             }
1894
1895                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1896
1897                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1898
1899                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1900                             # can't be used at the same time
1901                             if '%(Number' in media_template and 's' not in representation_ms_info:
1902                                 segment_duration = None
1903                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1904                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1905                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1906                                 representation_ms_info['fragments'] = [{
1907                                     'url': media_template % {
1908                                         'Number': segment_number,
1909                                         'Bandwidth': bandwidth,
1910                                     },
1911                                     'duration': segment_duration,
1912                                 } for segment_number in range(
1913                                     representation_ms_info['start_number'],
1914                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1915                             else:
1916                                 # $Number*$ or $Time$ in media template with S list available
1917                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1918                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1919                                 representation_ms_info['fragments'] = []
1920                                 segment_time = 0
1921                                 segment_d = None
1922                                 segment_number = representation_ms_info['start_number']
1923
1924                                 def add_segment_url():
1925                                     segment_url = media_template % {
1926                                         'Time': segment_time,
1927                                         'Bandwidth': bandwidth,
1928                                         'Number': segment_number,
1929                                     }
1930                                     representation_ms_info['fragments'].append({
1931                                         'url': segment_url,
1932                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1933                                     })
1934
1935                                 for num, s in enumerate(representation_ms_info['s']):
1936                                     segment_time = s.get('t') or segment_time
1937                                     segment_d = s['d']
1938                                     add_segment_url()
1939                                     segment_number += 1
1940                                     for r in range(s.get('r', 0)):
1941                                         segment_time += segment_d
1942                                         add_segment_url()
1943                                         segment_number += 1
1944                                     segment_time += segment_d
1945                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1946                             # No media template
1947                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1948                             # or any YouTube dashsegments video
1949                             fragments = []
1950                             segment_index = 0
1951                             timescale = representation_ms_info['timescale']
1952                             for s in representation_ms_info['s']:
1953                                 duration = float_or_none(s['d'], timescale)
1954                                 for r in range(s.get('r', 0) + 1):
1955                                     fragments.append({
1956                                         'url': representation_ms_info['segment_urls'][segment_index],
1957                                         'duration': duration,
1958                                     })
1959                                     segment_index += 1
1960                             representation_ms_info['fragments'] = fragments
1961                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1962                         # No fragments key is present in this case.
1963                         if 'fragments' in representation_ms_info:
1964                             f.update({
1965                                 'fragments': [],
1966                                 'protocol': 'http_dash_segments',
1967                             })
1968                             if 'initialization_url' in representation_ms_info:
1969                                 initialization_url = representation_ms_info['initialization_url']
1970                                 if not f.get('url'):
1971                                     f['url'] = initialization_url
1972                                 f['fragments'].append({'url': initialization_url})
1973                             f['fragments'].extend(representation_ms_info['fragments'])
1974                             for fragment in f['fragments']:
1975                                 fragment['url'] = urljoin(base_url, fragment['url'])
1976                         try:
1977                             existing_format = next(
1978                                 fo for fo in formats
1979                                 if fo['format_id'] == representation_id)
1980                         except StopIteration:
1981                             full_info = formats_dict.get(representation_id, {}).copy()
1982                             full_info.update(f)
1983                             formats.append(full_info)
1984                         else:
1985                             existing_format.update(f)
1986                     else:
1987                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1988         return formats
1989
1990     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1991         res = self._download_webpage_handle(
1992             ism_url, video_id,
1993             note=note or 'Downloading ISM manifest',
1994             errnote=errnote or 'Failed to download ISM manifest',
1995             fatal=fatal)
1996         if res is False:
1997             return []
1998         ism, urlh = res
1999
2000         return self._parse_ism_formats(
2001             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2002
2003     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2004         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2005             return []
2006
2007         duration = int(ism_doc.attrib['Duration'])
2008         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2009
2010         formats = []
2011         for stream in ism_doc.findall('StreamIndex'):
2012             stream_type = stream.get('Type')
2013             if stream_type not in ('video', 'audio'):
2014                 continue
2015             url_pattern = stream.attrib['Url']
2016             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2017             stream_name = stream.get('Name')
2018             for track in stream.findall('QualityLevel'):
2019                 fourcc = track.get('FourCC')
2020                 # TODO: add support for WVC1 and WMAP
2021                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2022                     self.report_warning('%s is not a supported codec' % fourcc)
2023                     continue
2024                 tbr = int(track.attrib['Bitrate']) // 1000
2025                 width = int_or_none(track.get('MaxWidth'))
2026                 height = int_or_none(track.get('MaxHeight'))
2027                 sampling_rate = int_or_none(track.get('SamplingRate'))
2028
2029                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2030                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2031
2032                 fragments = []
2033                 fragment_ctx = {
2034                     'time': 0,
2035                 }
2036                 stream_fragments = stream.findall('c')
2037                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2038                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2039                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2040                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2041                     if not fragment_ctx['duration']:
2042                         try:
2043                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2044                         except IndexError:
2045                             next_fragment_time = duration
2046                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2047                     for _ in range(fragment_repeat):
2048                         fragments.append({
2049                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2050                             'duration': fragment_ctx['duration'] / stream_timescale,
2051                         })
2052                         fragment_ctx['time'] += fragment_ctx['duration']
2053
2054                 format_id = []
2055                 if ism_id:
2056                     format_id.append(ism_id)
2057                 if stream_name:
2058                     format_id.append(stream_name)
2059                 format_id.append(compat_str(tbr))
2060
2061                 formats.append({
2062                     'format_id': '-'.join(format_id),
2063                     'url': ism_url,
2064                     'manifest_url': ism_url,
2065                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2066                     'width': width,
2067                     'height': height,
2068                     'tbr': tbr,
2069                     'asr': sampling_rate,
2070                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2071                     'acodec': 'none' if stream_type == 'video' else fourcc,
2072                     'protocol': 'ism',
2073                     'fragments': fragments,
2074                     '_download_params': {
2075                         'duration': duration,
2076                         'timescale': stream_timescale,
2077                         'width': width or 0,
2078                         'height': height or 0,
2079                         'fourcc': fourcc,
2080                         'codec_private_data': track.get('CodecPrivateData'),
2081                         'sampling_rate': sampling_rate,
2082                         'channels': int_or_none(track.get('Channels', 2)),
2083                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2084                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2085                     },
2086                 })
2087         return formats
2088
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from <video>/<audio> tags in an HTML page.

        Returns a list of dicts, one per media tag that yielded at least one
        format or subtitle track, each carrying 'formats', 'subtitles' and
        'thumbnail' (taken from the poster attribute) keys.
        """
        def absolute_url(video_url):
            # Resolve src attributes relative to the page URL
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Derive ext/codec hints from a type="..." attribute value,
            # e.g. 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"'
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type):
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) expand
            # into several formats, a direct media URL yields exactly one
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # Self-closing tags first; they have no inner content, so pad the
        # (tag, type) pairs with '' to match the 3-tuples extended below
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                # Nested <source> tags carry alternative renditions
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        # Merge codec/ext hints from the type attribute into
                        # the single direct-URL format
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> tags of kind subtitles/captions (or no kind) become
                # subtitle entries keyed by language
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2173
2174     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2175         formats = []
2176         hdcore_sign = 'hdcore=3.7.0'
2177         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2178         hds_host = hosts.get('hds')
2179         if hds_host:
2180             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2181         if 'hdcore=' not in f4m_url:
2182             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2183         f4m_formats = self._extract_f4m_formats(
2184             f4m_url, video_id, f4m_id='hds', fatal=False)
2185         for entry in f4m_formats:
2186             entry.update({'extra_param_to_segment_url': hdcore_sign})
2187         formats.extend(f4m_formats)
2188         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2189         hls_host = hosts.get('hls')
2190         if hls_host:
2191             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2192         formats.extend(self._extract_m3u8_formats(
2193             m3u8_url, video_id, 'mp4', 'm3u8_native',
2194             m3u8_id='hls', fatal=False))
2195         return formats
2196
2197     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2198         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2199         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2200         http_base_url = 'http' + url_base
2201         formats = []
2202         if 'm3u8' not in skip_protocols:
2203             formats.extend(self._extract_m3u8_formats(
2204                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2205                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2206         if 'f4m' not in skip_protocols:
2207             formats.extend(self._extract_f4m_formats(
2208                 http_base_url + '/manifest.f4m',
2209                 video_id, f4m_id='hds', fatal=False))
2210         if 'dash' not in skip_protocols:
2211             formats.extend(self._extract_mpd_formats(
2212                 http_base_url + '/manifest.mpd',
2213                 video_id, mpd_id='dash', fatal=False))
2214         if re.search(r'(?:/smil:|\.smil)', url_base):
2215             if 'smil' not in skip_protocols:
2216                 rtmp_formats = self._extract_smil_formats(
2217                     http_base_url + '/jwplayer.smil',
2218                     video_id, fatal=False)
2219                 for rtmp_format in rtmp_formats:
2220                     rtsp_format = rtmp_format.copy()
2221                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2222                     del rtsp_format['play_path']
2223                     del rtsp_format['ext']
2224                     rtsp_format.update({
2225                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2226                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2227                         'protocol': 'rtsp',
2228                     })
2229                     formats.extend([rtmp_format, rtsp_format])
2230         else:
2231             for protocol in ('rtmp', 'rtsp'):
2232                 if protocol not in skip_protocols:
2233                     formats.append({
2234                         'url': protocol + url_base,
2235                         'format_id': protocol,
2236                         'protocol': protocol,
2237                     })
2238         return formats
2239
2240     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2241         mobj = re.search(
2242             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2243             webpage)
2244         if mobj:
2245             try:
2246                 jwplayer_data = self._parse_json(mobj.group('options'),
2247                                                  video_id=video_id,
2248                                                  transform_source=transform_source)
2249             except ExtractorError:
2250                 pass
2251             else:
2252                 if isinstance(jwplayer_data, dict):
2253                     return jwplayer_data
2254
2255     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2256         jwplayer_data = self._find_jwplayer_data(
2257             webpage, video_id, transform_source=js_to_json)
2258         return self._parse_jwplayer_data(
2259             jwplayer_data, video_id, *args, **kwargs)
2260
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a jwplayer setup/config dict into an info dict or playlist.

        Returns the single entry dict when the playlist contains exactly one
        item, otherwise a playlist result. With require_title=True a missing
        'title' raises KeyError (as does a missing 'mediaid' when no
        video_id is supplied).
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            self._sort_formats(formats)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    # Only caption tracks become subtitles
                    if track.get('kind') != 'captions':
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entries.append({
                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                # NOTE(review): 'pubdate' appears to be treated as a numeric
                # timestamp here; some feeds may ship a date string, in which
                # case int_or_none yields None — confirm against callers
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'formats': formats,
            })
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2315
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer 'sources' list into format dicts.

        Manifest sources (HLS/DASH/SMIL) are expanded through the respective
        extractors; duplicate source URLs are processed only once.
        """
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            if source_url in urls:
                # Skip duplicate sources
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        # Split the URL into app URL and play path at the
                        # container prefix (mp4:/mp3:/flv:)
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2380
2381     def _live_title(self, name):
2382         """ Generate the title for a live video """
2383         now = datetime.datetime.now()
2384         now_str = now.strftime('%Y-%m-%d %H:%M')
2385         return name + ' ' + now_str
2386
2387     def _int(self, v, name, fatal=False, **kwargs):
2388         res = int_or_none(v, **kwargs)
2389         if 'get_attr' in kwargs:
2390             print(getattr(v, kwargs['get_attr']))
2391         if res is None:
2392             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2393             if fatal:
2394                 raise ExtractorError(msg)
2395             else:
2396                 self._downloader.report_warning(msg)
2397         return res
2398
2399     def _float(self, v, name, fatal=False, **kwargs):
2400         res = float_or_none(v, **kwargs)
2401         if res is None:
2402             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2403             if fatal:
2404                 raise ExtractorError(msg)
2405             else:
2406                 self._downloader.report_warning(msg)
2407         return res
2408
2409     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2410                     path='/', secure=False, discard=False, rest={}, **kwargs):
2411         cookie = compat_cookiejar.Cookie(
2412             0, name, value, port, not port is None, domain, True,
2413             domain.startswith('.'), path, True, secure, expire_time,
2414             discard, None, None, rest)
2415         self._downloader.cookiejar.set_cookie(cookie)
2416
2417     def _get_cookies(self, url):
2418         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2419         req = sanitized_Request(url)
2420         self._downloader.cookiejar.add_cookie_header(req)
2421         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2422
2423     def get_testcases(self, include_onlymatching=False):
2424         t = getattr(self, '_TEST', None)
2425         if t:
2426             assert not hasattr(self, '_TESTS'), \
2427                 '%s has _TEST and _TESTS' % type(self).__name__
2428             tests = [t]
2429         else:
2430             tests = getattr(self, '_TESTS', [])
2431         for t in tests:
2432             if not include_onlymatching and t.get('only_matching', False):
2433                 continue
2434             t['name'] = type(self).__name__[:-len('IE')]
2435             yield t
2436
2437     def is_suitable(self, age_limit):
2438         """ Test whether the extractor is generally suitable for the given
2439         age limit (i.e. pornographic sites are not, all others usually are) """
2440
2441         any_restricted = False
2442         for tc in self.get_testcases(include_onlymatching=False):
2443             if tc.get('playlist', []):
2444                 tc = tc['playlist'][0]
2445             is_restricted = age_restricted(
2446                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2447             if not is_restricted:
2448                 return True
2449             any_restricted = any_restricted or is_restricted
2450         return not any_restricted
2451
2452     def extract_subtitles(self, *args, **kwargs):
2453         if (self._downloader.params.get('writesubtitles', False) or
2454                 self._downloader.params.get('listsubtitles')):
2455             return self._get_subtitles(*args, **kwargs)
2456         return {}
2457
    def _get_subtitles(self, *args, **kwargs):
        # Hook for subclasses that support subtitles; called by extract_subtitles()
        raise NotImplementedError('This method must be implemented by subclasses')
2460
2461     @staticmethod
2462     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2463         """ Merge subtitle items for one language. Items with duplicated URLs
2464         will be dropped. """
2465         list1_urls = set([item['url'] for item in subtitle_list1])
2466         ret = list(subtitle_list1)
2467         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2468         return ret
2469
2470     @classmethod
2471     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2472         """ Merge two subtitle dictionaries, language by language. """
2473         ret = dict(subtitle_dict1)
2474         for lang in subtitle_dict2:
2475             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2476         return ret
2477
2478     def extract_automatic_captions(self, *args, **kwargs):
2479         if (self._downloader.params.get('writeautomaticsub', False) or
2480                 self._downloader.params.get('listsubtitles')):
2481             return self._get_automatic_captions(*args, **kwargs)
2482         return {}
2483
    def _get_automatic_captions(self, *args, **kwargs):
        # Hook for subclasses that support automatic captions; called by
        # extract_automatic_captions()
        raise NotImplementedError('This method must be implemented by subclasses')
2486
2487     def mark_watched(self, *args, **kwargs):
2488         if (self._downloader.params.get('mark_watched', False) and
2489                 (self._get_login_info()[0] is not None or
2490                     self._downloader.params.get('cookiefile') is not None)):
2491             self._mark_watched(*args, **kwargs)
2492
    def _mark_watched(self, *args, **kwargs):
        # Hook for subclasses that can mark media watched; called by mark_watched()
        raise NotImplementedError('This method must be implemented by subclasses')
2495
2496     def geo_verification_headers(self):
2497         headers = {}
2498         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2499         if geo_verification_proxy:
2500             headers['Ytdl-request-proxy'] = geo_verification_proxy
2501         return headers
2502
2503     def _generic_id(self, url):
2504         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2505
2506     def _generic_title(self, url):
2507         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2508
2509
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (default count), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Whether this extractor handles the given search URL."""
        mobj = re.match(cls._make_valid_url(), url)
        return mobj is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and delegate to _get_n_results
        with the requested number of results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # no prefix: single result
            n = 1
        elif prefix == 'all':
            n = self._MAX_RESULTS
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            if n > self._MAX_RESULTS:
                # clamp to the site's maximum, but tell the user
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY