1 from __future__ import unicode_literals
16 from ..compat import (
19 compat_etree_fromstring,
25 compat_urllib_parse_unquote,
26 compat_urllib_parse_urlencode,
27 compat_urllib_request,
30 from ..downloader.f4m import remove_encrypted_media
63 parse_m3u8_attributes,
70 class InfoExtractor(object):
71 """Information Extractor class.
73 Information extractors are the classes that, given a URL, extract
74 information about the video (or videos) the URL refers to. This
75 information includes the real video URL, the video title, author and
76 others. The information is stored in a dictionary which is then
77 passed to the YoutubeDL. The YoutubeDL processes this
78 information possibly downloading the video to the file system, among
79 other possible outcomes.
81 The type field determines the type of the result.
82 By far the most common value (and the default if _type is missing) is
83 "video", which indicates a single video.
85 For a video, the dictionaries must include the following fields:
88 title: Video title, unescaped.
90 Additionally, it must contain either a formats entry or a url one:
92 formats: A list of dictionaries for each format available, ordered
93 from worst to best quality.
96 * url Mandatory. The URL of the video file
98 The URL of the manifest file in case of
99 fragmented media (DASH, HLS, HDS)
100 * ext Will be calculated from URL if missing
101 * format A human-readable description of the format
102 ("mp4 container with h264/opus").
103 Calculated from the format_id, width, height.
104 and format_note fields if missing.
105 * format_id A short description of the format
106 ("mp4_h264_opus" or "19").
107 Technically optional, but strongly recommended.
108 * format_note Additional info about the format
109 ("3D" or "DASH video")
110 * width Width of the video, if known
111 * height Height of the video, if known
112 * resolution Textual description of width and height
113 * tbr Average bitrate of audio and video in KBit/s
114 * abr Average audio bitrate in KBit/s
115 * acodec Name of the audio codec in use
116 * asr Audio sampling rate in Hertz
117 * vbr Average video bitrate in KBit/s
119 * vcodec Name of the video codec in use
120 * container Name of the container format
121 * filesize The number of bytes, if known in advance
122 * filesize_approx An estimate for the number of bytes
123 * player_url SWF Player URL (used for rtmpdump).
124 * protocol The protocol that will be used for the actual
125 download, lower-case.
126 "http", "https", "rtsp", "rtmp", "rtmpe",
127 "m3u8", "m3u8_native" or "http_dash_segments".
129 Base URL for fragments. Each fragment's path
130 value (if present) will be relative to
132 * fragments A list of fragments of a fragmented media.
133 Each fragment entry must contain either an url
134 or a path. If an url is present it should be
135 considered by a client. Otherwise both path and
136 fragment_base_url must be present. Here is
137 the list of all potential fields:
138 * "url" - fragment's URL
139 * "path" - fragment's path relative to
141 * "duration" (optional, int or float)
142 * "filesize" (optional, int)
143 * preference Order number of this format. If this field is
144 present and not None, the formats get sorted
145 by this field, regardless of all other values.
146 -1 for default (order by other properties),
147 -2 or smaller for less than default.
148 < -1000 to hide the format (if there is
149 another one which is strictly better)
150 * language Language code, e.g. "de" or "en-US".
151 * language_preference Is this in the language mentioned in
153 10 if it's what the URL is about,
154 -1 for default (don't know),
155 -10 otherwise, other values reserved for now.
156 * quality Order number of the video quality of this
157 format, irrespective of the file format.
158 -1 for default (order by other properties),
159 -2 or smaller for less than default.
160 * source_preference Order number for this video source
161 (quality takes higher priority)
162 -1 for default (order by other properties),
163 -2 or smaller for less than default.
164 * http_headers A dictionary of additional HTTP headers
165 to add to the request.
166 * stretched_ratio If given and not 1, indicates that the
167 video's pixels are not square.
168 width : height ratio as float.
169 * no_resume The server does not support resuming the
170 (HTTP or RTMP) download. Boolean.
172 url: Final video URL.
173 ext: Video filename extension.
174 format: The video format, defaults to ext (used for --get-format)
175 player_url: SWF Player URL (used for rtmpdump).
177 The following fields are optional:
179 alt_title: A secondary title of the video.
180 display_id An alternative identifier for the video, not necessarily
181 unique, but available before title. Typically, id is
182 something like "4234987", title "Dancing naked mole rats",
183 and display_id "dancing-naked-mole-rats"
184 thumbnails: A list of dictionaries, with the following entries:
185 * "id" (optional, string) - Thumbnail format ID
187 * "preference" (optional, int) - quality of the image
188 * "width" (optional, int)
189 * "height" (optional, int)
190 * "resolution" (optional, string "{width}x{height}",
192 * "filesize" (optional, int)
193 thumbnail: Full URL to a video thumbnail image.
194 description: Full video description.
195 uploader: Full name of the video uploader.
196 license: License name the video is licensed under.
197 creator: The creator of the video.
198 release_date: The date (YYYYMMDD) when the video was released.
199 timestamp: UNIX timestamp of the moment the video became available.
200 upload_date: Video upload date (YYYYMMDD).
201 If not explicitly set, calculated from timestamp.
202 uploader_id: Nickname or id of the video uploader.
203 uploader_url: Full URL to a personal webpage of the video uploader.
204 location: Physical location where the video was filmed.
205 subtitles: The available subtitles as a dictionary in the format
206 {tag: subformats}. "tag" is usually a language code, and
207 "subformats" is a list sorted from lower to higher
208 preference, each element is a dictionary with the "ext"
210 * "data": The subtitles file contents
211 * "url": A URL pointing to the subtitles file
212 "ext" will be calculated from URL if missing
213 automatic_captions: Like 'subtitles', used by the YoutubeIE for
214 automatically generated captions
215 duration: Length of the video in seconds, as an integer or float.
216 view_count: How many users have watched the video on the platform.
217 like_count: Number of positive ratings of the video
218 dislike_count: Number of negative ratings of the video
219 repost_count: Number of reposts of the video
220 average_rating: Average rating given by users, the scale used depends on the webpage
221 comment_count: Number of comments on the video
222 comments: A list of comments, each with one or more of the following
223 properties (all but one of text or html optional):
224 * "author" - human-readable name of the comment author
225 * "author_id" - user ID of the comment author
227 * "html" - Comment as HTML
228 * "text" - Plain text of the comment
229 * "timestamp" - UNIX timestamp of comment
230 * "parent" - ID of the comment this one is replying to.
231 Set to "root" to indicate that this is a
232 comment to the original video.
233 age_limit: Age restriction for the video, as an integer (years)
234 webpage_url: The URL to the video webpage, if given to youtube-dl it
235 should allow to get the same result again. (It will be set
236 by YoutubeDL if it's missing)
237 categories: A list of categories that the video falls in, for example
239 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
240 is_live: True, False, or None (=unknown). Whether this video is a
241 live stream that goes on instead of a fixed-length video.
242 start_time: Time in seconds where the reproduction should start, as
243 specified in the URL.
244 end_time: Time in seconds where the reproduction should end, as
245 specified in the URL.
247 The following fields should only be used when the video belongs to some logical
250 chapter: Name or title of the chapter the video belongs to.
251 chapter_number: Number of the chapter the video belongs to, as an integer.
252 chapter_id: Id of the chapter the video belongs to, as a unicode string.
254 The following fields should only be used when the video is an episode of some
255 series, programme or podcast:
257 series: Title of the series or programme the video episode belongs to.
258 season: Title of the season the video episode belongs to.
259 season_number: Number of the season the video episode belongs to, as an integer.
260 season_id: Id of the season the video episode belongs to, as a unicode string.
261 episode: Title of the video episode. Unlike mandatory video title field,
262 this field should denote the exact title of the video episode
263 without any kind of decoration.
264 episode_number: Number of the video episode within a season, as an integer.
265 episode_id: Id of the video episode, as a unicode string.
267 The following fields should only be used when the media is a track or a part of
270 track: Title of the track.
271 track_number: Number of the track within an album or a disc, as an integer.
272 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
274 artist: Artist(s) of the track.
275 genre: Genre(s) of the track.
276 album: Title of the album the track belongs to.
277 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
278 album_artist: List of all artists appeared on the album (e.g.
279 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281 disc_number: Number of the disc or other physical medium the track belongs to,
283 release_year: Year (YYYY) when the album was released.
285 Unless mentioned otherwise, the fields should be Unicode strings.
287 Unless mentioned otherwise, None is equivalent to absence of information.
290 _type "playlist" indicates multiple videos.
291 There must be a key "entries", which is a list, an iterable, or a PagedList
292 object, each element of which is a valid dictionary by this specification.
294 Additionally, playlists can have "title", "description" and "id" attributes
295 with the same semantics as videos (see above).
298 _type "multi_video" indicates that there are multiple videos that
299 form a single show, for example multiple acts of an opera or TV episode.
300 It must have an entries key like a playlist and contain all the keys
301 required for a video at the same time.
304 _type "url" indicates that the video must be extracted from another
305 location, possibly by a different extractor. Its only required key is:
306 "url" - the next URL to extract.
307 The key "ie_key" can be set to the class name (minus the trailing "IE",
308 e.g. "Youtube") if the extractor class is known in advance.
309 Additionally, the dictionary may have any properties of the resolved entity
310 known in advance, for example "title" if the title of the referred video is
314 _type "url_transparent" entities have the same specification as "url", but
315 indicate that the given additional information is more precise than the one
316 associated with the resolved URL.
317 This is useful when a site employs a video service that hosts the video and
318 its technical metadata, but that video service does not embed a useful
319 title, description etc.
322 Subclasses of this one should re-define the _real_initialize() and
323 _real_extract() methods and define a _VALID_URL regexp.
324 Probably, they should also be added to the list of extractors.
326 _GEO_BYPASS attribute may be set to False in order to disable
327 geo restriction bypass mechanisms for a particular extractor.
328 Though it won't disable explicit geo restriction bypass based on
329 country code provided with geo_bypass_country. (experimental)
331 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
332 countries for this extractor. One of these countries will be used by
333 geo restriction bypass mechanism right away in order to bypass
334 geo restriction, of course, if the mechanism is not disabled. (experimental)
336 Finally, the _WORKING attribute should be set to False for broken IEs
337 in order to warn the users and skip the tests.
# Fake source IP advertised via the X-Forwarded-For header for geo
# restriction bypass; stays None until geo bypass kicks in.
_x_forwarded_for_ip = None
# Presumably geo-unrestricted countries for this extractor (see the class
# docstring); one is picked at random to fake the X-Forwarded-For IP.
_GEO_COUNTRIES = None
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader (a YoutubeDL
    instance used for options, output and HTTP requests)."""
    # Reset the per-instance fake IP; the class-level default may have
    # been set by a previous geo-bypass attempt.
    self._x_forwarded_for_ip = None
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Deliberately probe cls.__dict__ rather than hasattr/getattr: the
    # cached compiled regexp must belong to *this* class, whereas
    # attribute lookup would also find one cached on a superclass with
    # a different _VALID_URL.
    cached_re = cls.__dict__.get('_VALID_URL_RE')
    if cached_re is None:
        cached_re = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cached_re.match(url) is not None
def _match_id(cls, url):
    # Compile and cache the pattern on *this* class; cls.__dict__ is
    # checked instead of hasattr so a superclass's cached regexp (with
    # a different _VALID_URL) is never reused.
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    # NOTE(review): in this view the match object is computed but never
    # returned — presumably the 'id' group should be extracted and
    # returned here; confirm against the full source.
374 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Set up geo-bypass (possibly faking an X-Forwarded-For IP) before
    # the extractor-specific initialization runs.
    self.__initialize_geo_bypass()
    self._real_initialize()
def __initialize_geo_bypass(self):
    # Pick (or accept an explicit) country and fake an X-Forwarded-For
    # IP from it, unless one is already set.
    if not self._x_forwarded_for_ip:
        country_code = self._downloader.params.get('geo_bypass_country', None)
        # If there is no explicit country for geo bypass specified and
        # the extractor is known to be geo restricted let's fake IP
        # as X-Forwarded-For right away.
        if (not country_code and
                self._downloader.params.get('geo_bypass', True) and
                self._GEO_COUNTRIES):
            country_code = random.choice(self._GEO_COUNTRIES)
        # NOTE(review): as shown here random_ipv4 is reached even when
        # country_code is still None — presumably a guard is missing in
        # this view; confirm against the full source.
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
        if self._downloader.params.get('verbose', False):
            self._downloader.to_stdout(
                '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
401 def extract(self, url):
402 """Extracts URL information and returns it in list of dicts."""
407 ie_result = self._real_extract(url)
408 if self._x_forwarded_for_ip:
409 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
411 except GeoRestrictedError as e:
412 if self.__maybe_fake_ip_and_retry(e.countries):
415 except ExtractorError:
417 except compat_http_client.IncompleteRead as e:
418 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
419 except (KeyError, StopIteration) as e:
420 raise ExtractorError('An extractor error has occurred.', cause=e)
422 def __maybe_fake_ip_and_retry(self, countries):
423 if (not self._downloader.params.get('geo_bypass_country', None) and
425 self._downloader.params.get('geo_bypass', True) and
426 not self._x_forwarded_for_ip and
428 self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries))
429 if self._x_forwarded_for_ip:
431 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    downloader may be None; other methods read it back via
    self._downloader.
    """
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses.

    Default implementation does nothing; subclasses override this to
    perform authentication or other one-time setup.
    """
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses.

    Must return an info dict (or playlist/url result) as described in
    the class docstring.
    """
449 """A string for getting the InfoExtractor with get_info_extractor"""
450 return compat_str(cls.__name__[:-2])
454 return compat_str(type(self).__name__[:-2])
456 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
457 """ Returns the response handle """
459 self.report_download_webpage(video_id)
460 elif note is not False:
462 self.to_screen('%s' % (note,))
464 self.to_screen('%s: %s' % (video_id, note))
465 if isinstance(url_or_request, compat_urllib_request.Request):
466 url_or_request = update_Request(
467 url_or_request, data=data, headers=headers, query=query)
470 url_or_request = update_url_query(url_or_request, query)
471 if data is not None or headers:
472 url_or_request = sanitized_Request(url_or_request, data, headers)
474 return self._downloader.urlopen(url_or_request)
475 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
479 errnote = 'Unable to download webpage'
481 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
483 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
485 self._downloader.report_warning(errmsg)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    # Work on a copy: the original code mutated `headers` in place when
    # injecting X-Forwarded-For below, which silently poisoned the shared
    # mutable default argument (and the caller's dict) across calls.
    headers = dict(headers)

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        if 'X-Forwarded-For' not in headers:
            headers['X-Forwarded-For'] = self._x_forwarded_for_ip

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)
511 def _guess_encoding_from_content(content_type, webpage_bytes):
512 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
514 encoding = m.group(1)
516 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
517 webpage_bytes[:1024])
519 encoding = m.group(1).decode('ascii')
520 elif webpage_bytes.startswith(b'\xff\xfe'):
527 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
528 content_type = urlh.headers.get('Content-Type', '')
529 webpage_bytes = urlh.read()
530 if prefix is not None:
531 webpage_bytes = prefix + webpage_bytes
533 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
534 if self._downloader.params.get('dump_intermediate_pages', False):
536 url = url_or_request.get_full_url()
537 except AttributeError:
539 self.to_screen('Dumping request to ' + url)
540 dump = base64.b64encode(webpage_bytes).decode('ascii')
541 self._downloader.to_screen(dump)
542 if self._downloader.params.get('write_pages', False):
544 url = url_or_request.get_full_url()
545 except AttributeError:
547 basen = '%s_%s' % (video_id, url)
549 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
550 basen = basen[:240 - len(h)] + h
551 raw_filename = basen + '.dump'
552 filename = sanitize_filename(raw_filename, restricted=True)
553 self.to_screen('Saving request to ' + filename)
554 # Working around MAX_PATH limitation on Windows (see
555 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
556 if compat_os_name == 'nt':
557 absfilepath = os.path.abspath(filename)
558 if len(absfilepath) > 259:
559 filename = '\\\\?\\' + absfilepath
560 with open(filename, 'wb') as outf:
561 outf.write(webpage_bytes)
564 content = webpage_bytes.decode(encoding, 'replace')
566 content = webpage_bytes.decode('utf-8', 'replace')
568 if ('<title>Access to this site is blocked</title>' in content and
569 'Websense' in content[:512]):
570 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
571 blocked_iframe = self._html_search_regex(
572 r'<iframe src="([^"]+)"', content,
573 'Websense information URL', default=None)
575 msg += ' Visit %s for more details' % blocked_iframe
576 raise ExtractorError(msg, expected=True)
577 if '<title>The URL you requested has been blocked</title>' in content[:512]:
579 'Access to this webpage has been blocked by Indian censorship. '
580 'Use a VPN or proxy server (with --proxy) to route around it.')
581 block_msg = self._html_search_regex(
582 r'</h1><p>(.*?)</p>',
583 content, 'block message', default=None)
585 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
586 raise ExtractorError(msg, expected=True)
590 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
591 """ Returns the data of the page as a string """
594 while success is False:
596 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
598 except compat_http_client.IncompleteRead as e:
600 if try_count >= tries:
602 self._sleep(timeout, video_id)
609 def _download_xml(self, url_or_request, video_id,
610 note='Downloading XML', errnote='Unable to download XML',
611 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
612 """Return the xml as an xml.etree.ElementTree.Element"""
613 xml_string = self._download_webpage(
614 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
615 if xml_string is False:
618 xml_string = transform_source(xml_string)
619 return compat_etree_fromstring(xml_string.encode('utf-8'))
621 def _download_json(self, url_or_request, video_id,
622 note='Downloading JSON metadata',
623 errnote='Unable to download JSON metadata',
624 transform_source=None,
625 fatal=True, encoding=None, data=None, headers={}, query={}):
626 json_string = self._download_webpage(
627 url_or_request, video_id, note, errnote, fatal=fatal,
628 encoding=encoding, data=data, headers=headers, query=query)
629 if (not fatal) and json_string is False:
631 return self._parse_json(
632 json_string, video_id, transform_source=transform_source, fatal=fatal)
634 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
636 json_string = transform_source(json_string)
638 return json.loads(json_string)
639 except ValueError as ve:
640 errmsg = '%s: Failed to parse JSON ' % video_id
642 raise ExtractorError(errmsg, cause=ve)
644 self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward msg to the downloader's warning channel, prefixed with
    the extractor name and, when given, the video id."""
    id_prefix = '%s: ' % video_id if video_id is not None else ''
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, id_prefix, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download (status line '<video_id>: Downloading webpage')."""
    self.to_screen('%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Report attempt to confirm age (for age-gated content)."""
    self.to_screen('Confirming age')
def report_login(self):
    """Report attempt to log in."""
    self.to_screen('Logging in')
672 def raise_login_required(msg='This video is only available for registered users'):
673 raise ExtractorError(
674 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
    # Abort with the dedicated geo-restriction error; `countries` (list
    # of country codes the video is available in, if known) lets the
    # caller retry with a faked X-Forwarded-For IP.
    raise GeoRestrictedError(msg, countries=countries)
681 # Methods for following #608
683 def url_result(url, ie=None, video_id=None, video_title=None):
684 """Returns a URL that points to a page that should be processed"""
685 # TODO: ie should be the class used for getting the info
686 video_info = {'_type': 'url',
689 if video_id is not None:
690 video_info['id'] = video_id
691 if video_title is not None:
692 video_info['title'] = video_title
696 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
697 """Returns a playlist"""
698 video_info = {'_type': 'playlist',
701 video_info['id'] = playlist_id
703 video_info['title'] = playlist_title
704 if playlist_description:
705 video_info['description'] = playlist_description
708 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
710 Perform a regex search on the given string, using a single or a list of
711 patterns returning the first matching group.
712 In case of failure return a default value or raise a WARNING or a
713 RegexNotFoundError, depending on fatal, specifying the field name.
715 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
716 mobj = re.search(pattern, string, flags)
719 mobj = re.search(p, string, flags)
723 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
724 _name = '\033[0;34m%s\033[0m' % name
730 # return the first matching group
731 return next(g for g in mobj.groups() if g is not None)
733 return mobj.group(group)
734 elif default is not NO_DEFAULT:
737 raise RegexNotFoundError('Unable to extract %s' % _name)
739 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
742 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
744 Like _search_regex, but strips HTML tags and unescapes entities.
746 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
748 return clean_html(res).strip()
752 def _get_netrc_login_info(self, netrc_machine=None):
755 netrc_machine = netrc_machine or self._NETRC_MACHINE
757 if self._downloader.params.get('usenetrc', False):
759 info = netrc.netrc().authenticators(netrc_machine)
764 raise netrc.NetrcParseError(
765 'No authenticators for %s' % netrc_machine)
766 except (IOError, netrc.NetrcParseError) as err:
767 self._downloader.report_warning(
768 'parsing .netrc: %s' % error_to_compat_str(err))
770 return username, password
772 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
774 Get the login info as (username, password)
775 First look for the manually specified credentials using username_option
776 and password_option as keys in params dictionary. If no such credentials
777 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
779 If there's no info available, return (None, None)
781 if self._downloader is None:
784 downloader_params = self._downloader.params
786 # Attempt to use provided username and password or .netrc data
787 if downloader_params.get(username_option) is not None:
788 username = downloader_params[username_option]
789 password = downloader_params[password_option]
791 username, password = self._get_netrc_login_info(netrc_machine)
793 return username, password
795 def _get_tfa_info(self, note='two-factor verification code'):
797 Get the two-factor authentication info
798 TODO - asking the user will be required for sms/phone verify
799 currently just uses the command line option
800 If there's no info available, return None
802 if self._downloader is None:
804 downloader_params = self._downloader.params
806 if downloader_params.get('twofactor') is not None:
807 return downloader_params['twofactor']
809 return compat_getpass('Type %s and press [Return]: ' % note)
811 # Helper functions for extracting OpenGraph info
813 def _og_regexes(prop):
814 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
815 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
816 % {'prop': re.escape(prop)})
817 template = r'<meta[^>]+?%s[^>]+?%s'
819 template % (property_re, content_re),
820 template % (content_re, property_re),
824 def _meta_regex(prop):
825 return r'''(?isx)<meta
826 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
827 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
829 def _og_search_property(self, prop, html, name=None, **kargs):
830 if not isinstance(prop, (list, tuple)):
833 name = 'OpenGraph %s' % prop[0]
836 og_regexes.extend(self._og_regexes(p))
837 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
840 return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    # Convenience wrapper: og:image, reported as 'thumbnail URL'; never
    # fatal since many pages lack it.
    return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    # og:description; non-fatal since descriptions are optional metadata.
    return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    # og:title; fatal by default (title is a mandatory field).
    return self._og_search_property('title', html, **kargs)
851 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
852 regexes = self._og_regexes('video') + self._og_regexes('video:url')
854 regexes = self._og_regexes('video:secure_url') + regexes
855 return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    # og:url — canonical URL of the page as declared by the site.
    return self._og_search_property('url', html, **kargs)
860 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
861 if not isinstance(name, (list, tuple)):
863 if display_name is None:
864 display_name = name[0]
865 return self._html_search_regex(
866 [self._meta_regex(n) for n in name],
867 html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    # Dublin Core 'dc.creator' meta tag, reported as 'uploader'.
    return self._html_search_meta('dc.creator', html, 'uploader')
872 def _rta_search(self, html):
873 # See http://www.rtalabel.org/index.php?content=howtofaq#single
874 if re.search(r'(?ix)<meta\s+name="rating"\s+'
875 r' content="RTA-5042-1996-1400-1577-RTA"',
880 def _media_rating_search(self, html):
881 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
882 rating = self._html_search_meta('rating', html)
894 return RATING_TABLE.get(rating.lower())
896 def _family_friendly_search(self, html):
897 # See http://schema.org/VideoObject
898 family_friendly = self._html_search_meta('isFamilyFriendly', html)
900 if not family_friendly:
909 return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
    # Twitter Card player URL (twitter:player meta tag).
    return self._html_search_meta('twitter:player', html,
                                  'twitter card player')
915 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
916 json_ld = self._search_regex(
917 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
918 html, 'JSON-LD', group='json_ld', **kwargs)
919 default = kwargs.get('default', NO_DEFAULT)
921 return default if default is not NO_DEFAULT else {}
922 # JSON-LD may be malformed and thus `fatal` should be respected.
923 # At the same time `default` may be passed that assumes `fatal=False`
924 # for _search_regex. Let's simulate the same behavior here as well.
925 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
926 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
928 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
929 if isinstance(json_ld, compat_str):
930 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
934 if not isinstance(json_ld, (list, tuple, dict)):
936 if isinstance(json_ld, dict):
939 if e.get('@context') == 'http://schema.org':
940 item_type = e.get('@type')
941 if expected_type is not None and expected_type != item_type:
943 if item_type == 'TVEpisode':
945 'episode': unescapeHTML(e.get('name')),
946 'episode_number': int_or_none(e.get('episodeNumber')),
947 'description': unescapeHTML(e.get('description')),
949 part_of_season = e.get('partOfSeason')
950 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
951 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
952 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
953 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
954 info['series'] = unescapeHTML(part_of_series.get('name'))
955 elif item_type == 'Article':
957 'timestamp': parse_iso8601(e.get('datePublished')),
958 'title': unescapeHTML(e.get('headline')),
959 'description': unescapeHTML(e.get('articleBody')),
961 elif item_type == 'VideoObject':
963 'url': e.get('contentUrl'),
964 'title': unescapeHTML(e.get('name')),
965 'description': unescapeHTML(e.get('description')),
966 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
967 'duration': parse_duration(e.get('duration')),
968 'timestamp': unified_timestamp(e.get('uploadDate')),
969 'filesize': float_or_none(e.get('contentSize')),
970 'tbr': int_or_none(e.get('bitrate')),
971 'width': int_or_none(e.get('width')),
972 'height': int_or_none(e.get('height')),
975 return dict((k, v) for k, v in info.items() if v is not None)
978 def _hidden_inputs(html):
979 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
981 for input in re.findall(r'(?i)(<input[^>]+>)', html):
982 attrs = extract_attributes(input)
985 if attrs.get('type') not in ('hidden', 'submit'):
987 name = attrs.get('name') or attrs.get('id')
988 value = attrs.get('value')
989 if name and value is not None:
990 hidden_inputs[name] = value
993 def _form_hidden_inputs(self, form_id, html):
994 form = self._search_regex(
995 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
996 html, '%s form' % form_id, group='form')
997 return self._hidden_inputs(form)
999 def _sort_formats(self, formats, field_preference=None):
1001 raise ExtractorError('No video formats found')
1004 # Automatically determine tbr when missing based on abr and vbr (improves
1005 # formats sorting in some cases)
1006 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1007 f['tbr'] = f['abr'] + f['vbr']
1009 def _formats_key(f):
1010 # TODO remove the following workaround
1011 from ..utils import determine_ext
1012 if not f.get('ext') and 'url' in f:
1013 f['ext'] = determine_ext(f['url'])
1015 if isinstance(field_preference, (list, tuple)):
1018 if f.get(field) is not None
1019 else ('' if field == 'format_id' else -1)
1020 for field in field_preference)
1022 preference = f.get('preference')
1023 if preference is None:
1025 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1028 protocol = f.get('protocol') or determine_protocol(f)
1029 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1031 if f.get('vcodec') == 'none': # audio only
1033 if self._downloader.params.get('prefer_free_formats'):
1034 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1036 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1039 audio_ext_preference = ORDER.index(f['ext'])
1041 audio_ext_preference = -1
1043 if f.get('acodec') == 'none': # video only
1045 if self._downloader.params.get('prefer_free_formats'):
1046 ORDER = ['flv', 'mp4', 'webm']
1048 ORDER = ['webm', 'flv', 'mp4']
1050 ext_preference = ORDER.index(f['ext'])
1053 audio_ext_preference = 0
1057 f.get('language_preference') if f.get('language_preference') is not None else -1,
1058 f.get('quality') if f.get('quality') is not None else -1,
1059 f.get('tbr') if f.get('tbr') is not None else -1,
1060 f.get('filesize') if f.get('filesize') is not None else -1,
1061 f.get('vbr') if f.get('vbr') is not None else -1,
1062 f.get('height') if f.get('height') is not None else -1,
1063 f.get('width') if f.get('width') is not None else -1,
1066 f.get('abr') if f.get('abr') is not None else -1,
1067 audio_ext_preference,
1068 f.get('fps') if f.get('fps') is not None else -1,
1069 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1070 f.get('source_preference') if f.get('source_preference') is not None else -1,
1071 f.get('format_id') if f.get('format_id') is not None else '',
1073 formats.sort(key=_formats_key)
1075 def _check_formats(self, formats, video_id):
1077 formats[:] = filter(
1078 lambda f: self._is_valid_url(
1080 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1084 def _remove_duplicate_formats(formats):
1088 if f['url'] not in format_urls:
1089 format_urls.add(f['url'])
1090 unique_formats.append(f)
1091 formats[:] = unique_formats
1093 def _is_valid_url(self, url, video_id, item='video', headers={}):
1094 url = self._proto_relative_url(url, scheme='http:')
1095 # For now assume non HTTP(S) URLs always valid
1096 if not (url.startswith('http://') or url.startswith('https://')):
1099 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1101 except ExtractorError as e:
1102 if isinstance(e.cause, compat_urllib_error.URLError):
1104 '%s: %s URL is invalid, skipping' % (video_id, item))
1108 def http_scheme(self):
1109 """ Either "http:" or "https:", depending on the user's preferences """
1112 if self._downloader.params.get('prefer_insecure', False)
1115 def _proto_relative_url(self, url, scheme=None):
1118 if url.startswith('//'):
1120 scheme = self.http_scheme()
1125 def _sleep(self, timeout, video_id, msg_template=None):
1126 if msg_template is None:
1127 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1128 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1132 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1133 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1134 fatal=True, m3u8_id=None):
1135 manifest = self._download_xml(
1136 manifest_url, video_id, 'Downloading f4m manifest',
1137 'Unable to download f4m manifest',
1138 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1139 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1140 transform_source=transform_source,
1143 if manifest is False:
1146 return self._parse_f4m_formats(
1147 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1148 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1150 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1151 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1152 fatal=True, m3u8_id=None):
1153 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1154 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1155 if akamai_pv is not None and ';' in akamai_pv.text:
1156 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1157 if playerVerificationChallenge.strip() != '':
1161 manifest_version = '1.0'
1162 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1164 manifest_version = '2.0'
1165 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1166 # Remove unsupported DRM protected media from final formats
1167 # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1168 media_nodes = remove_encrypted_media(media_nodes)
1171 base_url = xpath_text(
1172 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1173 'base URL', default=None)
1175 base_url = base_url.strip()
1177 bootstrap_info = xpath_element(
1178 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1179 'bootstrap info', default=None)
1182 mime_type = xpath_text(
1183 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1184 'base URL', default=None)
1185 if mime_type and mime_type.startswith('audio/'):
1188 for i, media_el in enumerate(media_nodes):
1189 tbr = int_or_none(media_el.attrib.get('bitrate'))
1190 width = int_or_none(media_el.attrib.get('width'))
1191 height = int_or_none(media_el.attrib.get('height'))
1192 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1193 # If <bootstrapInfo> is present, the specified f4m is a
1194 # stream-level manifest, and only set-level manifests may refer to
1195 # external resources. See section 11.4 and section 4 of F4M spec
1196 if bootstrap_info is None:
1198 # @href is introduced in 2.0, see section 11.6 of F4M spec
1199 if manifest_version == '2.0':
1200 media_url = media_el.attrib.get('href')
1201 if media_url is None:
1202 media_url = media_el.attrib.get('url')
1206 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1207 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1208 # If media_url is itself a f4m manifest do the recursive extraction
1209 # since bitrates in parent manifest (this one) and media_url manifest
1210 # may differ leading to inability to resolve the format by requested
1211 # bitrate in f4m downloader
1212 ext = determine_ext(manifest_url)
1214 f4m_formats = self._extract_f4m_formats(
1215 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1216 transform_source=transform_source, fatal=fatal)
1217 # Sometimes stream-level manifest contains single media entry that
1218 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1219 # At the same time parent's media entry in set-level manifest may
1220 # contain it. We will copy it from parent in such cases.
1221 if len(f4m_formats) == 1:
1224 'tbr': f.get('tbr') or tbr,
1225 'width': f.get('width') or width,
1226 'height': f.get('height') or height,
1227 'format_id': f.get('format_id') if not tbr else format_id,
1230 formats.extend(f4m_formats)
1233 formats.extend(self._extract_m3u8_formats(
1234 manifest_url, video_id, 'mp4', preference=preference,
1235 m3u8_id=m3u8_id, fatal=fatal))
1238 'format_id': format_id,
1239 'url': manifest_url,
1240 'manifest_url': manifest_url,
1241 'ext': 'flv' if bootstrap_info is not None else None,
1246 'preference': preference,
1250 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1252 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1256 'preference': preference - 100 if preference else -100,
1257 'resolution': 'multiple',
1258 'format_note': 'Quality selection URL',
1261 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1262 entry_protocol='m3u8', preference=None,
1263 m3u8_id=None, note=None, errnote=None,
1264 fatal=True, live=False):
1266 res = self._download_webpage_handle(
1268 note=note or 'Downloading m3u8 information',
1269 errnote=errnote or 'Failed to download m3u8 information',
1273 m3u8_doc, urlh = res
1274 m3u8_url = urlh.geturl()
1276 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1279 formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1281 format_url = lambda u: (
1283 if re.match(r'^https?://', u)
1284 else compat_urlparse.urljoin(m3u8_url, u))
1286 # We should try extracting formats only from master playlists [1], i.e.
1287 # playlists that describe available qualities. On the other hand media
1288 # playlists [2] should be returned as is since they contain just the media
1289 # without qualities renditions.
1290 # Fortunately, master playlist can be easily distinguished from media
1291 # playlist based on particular tags availability. As of [1, 2] master
1292 # playlist tags MUST NOT appear in a media playist and vice versa.
1293 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1294 # and MUST NOT appear in master playlist thus we can clearly detect media
1295 # playlist with this criterion.
1296 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1297 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1298 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1299 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1302 'format_id': m3u8_id,
1304 'protocol': entry_protocol,
1305 'preference': preference,
1307 audio_in_video_stream = {}
1310 for line in m3u8_doc.splitlines():
1311 if line.startswith('#EXT-X-STREAM-INF:'):
1312 last_info = parse_m3u8_attributes(line)
1313 elif line.startswith('#EXT-X-MEDIA:'):
1314 media = parse_m3u8_attributes(line)
1315 media_type = media.get('TYPE')
1316 if media_type in ('VIDEO', 'AUDIO'):
1317 group_id = media.get('GROUP-ID')
1318 media_url = media.get('URI')
1321 for v in (group_id, media.get('NAME')):
1325 'format_id': '-'.join(format_id),
1326 'url': format_url(media_url),
1327 'language': media.get('LANGUAGE'),
1329 'protocol': entry_protocol,
1330 'preference': preference,
1332 if media_type == 'AUDIO':
1333 f['vcodec'] = 'none'
1334 if group_id and not audio_in_video_stream.get(group_id):
1335 audio_in_video_stream[group_id] = False
1338 # When there is no URI in EXT-X-MEDIA let this tag's
1339 # data be used by regular URI lines below
1341 if media_type == 'AUDIO' and group_id:
1342 audio_in_video_stream[group_id] = True
1343 elif line.startswith('#') or not line.strip():
1346 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1349 format_id.append(m3u8_id)
1350 # Despite specification does not mention NAME attribute for
1351 # EXT-X-STREAM-INF it still sometimes may be present
1352 stream_name = last_info.get('NAME') or last_media.get('NAME')
1353 # Bandwidth of live streams may differ over time thus making
1354 # format_id unpredictable. So it's better to keep provided
1357 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1358 manifest_url = format_url(line.strip())
1360 'format_id': '-'.join(format_id),
1361 'url': manifest_url,
1362 'manifest_url': manifest_url,
1365 'fps': float_or_none(last_info.get('FRAME-RATE')),
1366 'protocol': entry_protocol,
1367 'preference': preference,
1369 resolution = last_info.get('RESOLUTION')
1371 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1373 f['width'] = int(mobj.group('width'))
1374 f['height'] = int(mobj.group('height'))
1375 # Unified Streaming Platform
1377 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1379 abr, vbr = mobj.groups()
1380 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1385 f.update(parse_codecs(last_info.get('CODECS')))
1386 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1387 # TODO: update acodec for audio only formats with the same GROUP-ID
1388 f['acodec'] = 'none'
1395 def _xpath_ns(path, namespace=None):
1399 for c in path.split('/'):
1400 if not c or c == '.':
1403 out.append('{%s}%s' % (namespace, c))
1404 return '/'.join(out)
1406 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1407 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1413 namespace = self._parse_smil_namespace(smil)
1415 return self._parse_smil_formats(
1416 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1418 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1419 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1422 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1424 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1425 return self._download_xml(
1426 smil_url, video_id, 'Downloading SMIL file',
1427 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1429 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1430 namespace = self._parse_smil_namespace(smil)
1432 formats = self._parse_smil_formats(
1433 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1434 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1436 video_id = os.path.splitext(url_basename(smil_url))[0]
1440 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1441 name = meta.attrib.get('name')
1442 content = meta.attrib.get('content')
1443 if not name or not content:
1445 if not title and name == 'title':
1447 elif not description and name in ('description', 'abstract'):
1448 description = content
1449 elif not upload_date and name == 'date':
1450 upload_date = unified_strdate(content)
1453 'id': image.get('type'),
1454 'url': image.get('src'),
1455 'width': int_or_none(image.get('width')),
1456 'height': int_or_none(image.get('height')),
1457 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1461 'title': title or video_id,
1462 'description': description,
1463 'upload_date': upload_date,
1464 'thumbnails': thumbnails,
1466 'subtitles': subtitles,
1469 def _parse_smil_namespace(self, smil):
1470 return self._search_regex(
1471 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1473 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1475 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1476 b = meta.get('base') or meta.get('httpBase')
1487 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1488 for medium in media:
1489 src = medium.get('src')
1490 if not src or src in srcs:
1494 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1495 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1496 width = int_or_none(medium.get('width'))
1497 height = int_or_none(medium.get('height'))
1498 proto = medium.get('proto')
1499 ext = medium.get('ext')
1500 src_ext = determine_ext(src)
1501 streamer = medium.get('streamer') or base
1503 if proto == 'rtmp' or streamer.startswith('rtmp'):
1509 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1511 'filesize': filesize,
1515 if transform_rtmp_url:
1516 streamer, src = transform_rtmp_url(streamer, src)
1517 formats[-1].update({
1523 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1524 src_url = src_url.strip()
1526 if proto == 'm3u8' or src_ext == 'm3u8':
1527 m3u8_formats = self._extract_m3u8_formats(
1528 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1529 if len(m3u8_formats) == 1:
1531 m3u8_formats[0].update({
1532 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1537 formats.extend(m3u8_formats)
1540 if src_ext == 'f4m':
1545 'plugin': 'flowplayer-3.2.0.1',
1547 f4m_url += '&' if '?' in f4m_url else '?'
1548 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1549 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1552 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1556 'ext': ext or src_ext or 'flv',
1557 'format_id': 'http-%d' % (bitrate or http_count),
1559 'filesize': filesize,
1567 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1570 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1571 src = textstream.get('src')
1572 if not src or src in urls:
1575 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1576 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1577 subtitles.setdefault(lang, []).append({
1583 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1584 xspf = self._download_xml(
1585 playlist_url, playlist_id, 'Downloading xpsf playlist',
1586 'Unable to download xspf manifest', fatal=fatal)
1589 return self._parse_xspf(xspf, playlist_id)
1591 def _parse_xspf(self, playlist, playlist_id):
1593 'xspf': 'http://xspf.org/ns/0/',
1594 's1': 'http://static.streamone.nl/player/ns/0',
1598 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1600 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1601 description = xpath_text(
1602 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1603 thumbnail = xpath_text(
1604 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1605 duration = float_or_none(
1606 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1609 'url': location.text,
1610 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1611 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1612 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1613 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1614 self._sort_formats(formats)
1619 'description': description,
1620 'thumbnail': thumbnail,
1621 'duration': duration,
1626 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1627 res = self._download_webpage_handle(
1629 note=note or 'Downloading MPD manifest',
1630 errnote=errnote or 'Failed to download MPD manifest',
1635 mpd_base_url = base_url(urlh.geturl())
1637 return self._parse_mpd_formats(
1638 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1639 formats_dict=formats_dict, mpd_url=mpd_url)
1641 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1643 Parse formats from MPD manifest.
1645 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1646 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1647 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1649 if mpd_doc.get('type') == 'dynamic':
1652 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1655 return self._xpath_ns(path, namespace)
1657 def is_drm_protected(element):
1658 return element.find(_add_ns('ContentProtection')) is not None
1660 def extract_multisegment_info(element, ms_parent_info):
1661 ms_info = ms_parent_info.copy()
1663 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1664 # common attributes and elements. We will only extract relevant
1666 def extract_common(source):
1667 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1668 if segment_timeline is not None:
1669 s_e = segment_timeline.findall(_add_ns('S'))
1671 ms_info['total_number'] = 0
1674 r = int(s.get('r', 0))
1675 ms_info['total_number'] += 1 + r
1676 ms_info['s'].append({
1677 't': int(s.get('t', 0)),
1678 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1679 'd': int(s.attrib['d']),
1682 start_number = source.get('startNumber')
1684 ms_info['start_number'] = int(start_number)
1685 timescale = source.get('timescale')
1687 ms_info['timescale'] = int(timescale)
1688 segment_duration = source.get('duration')
1689 if segment_duration:
1690 ms_info['segment_duration'] = int(segment_duration)
1692 def extract_Initialization(source):
1693 initialization = source.find(_add_ns('Initialization'))
1694 if initialization is not None:
1695 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1697 segment_list = element.find(_add_ns('SegmentList'))
1698 if segment_list is not None:
1699 extract_common(segment_list)
1700 extract_Initialization(segment_list)
1701 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1703 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1705 segment_template = element.find(_add_ns('SegmentTemplate'))
1706 if segment_template is not None:
1707 extract_common(segment_template)
1708 media = segment_template.get('media')
1710 ms_info['media'] = media
1711 initialization = segment_template.get('initialization')
1713 ms_info['initialization'] = initialization
1715 extract_Initialization(segment_template)
1718 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1720 for period in mpd_doc.findall(_add_ns('Period')):
1721 period_duration = parse_duration(period.get('duration')) or mpd_duration
1722 period_ms_info = extract_multisegment_info(period, {
1726 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1727 if is_drm_protected(adaptation_set):
1729 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1730 for representation in adaptation_set.findall(_add_ns('Representation')):
1731 if is_drm_protected(representation):
1733 representation_attrib = adaptation_set.attrib.copy()
1734 representation_attrib.update(representation.attrib)
1735 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1736 mime_type = representation_attrib['mimeType']
1737 content_type = mime_type.split('/')[0]
1738 if content_type == 'text':
1739 # TODO implement WebVTT downloading
1741 elif content_type == 'video' or content_type == 'audio':
1743 for element in (representation, adaptation_set, period, mpd_doc):
1744 base_url_e = element.find(_add_ns('BaseURL'))
1745 if base_url_e is not None:
1746 base_url = base_url_e.text + base_url
1747 if re.match(r'^https?://', base_url):
1749 if mpd_base_url and not re.match(r'^https?://', base_url):
1750 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1752 base_url = mpd_base_url + base_url
1753 representation_id = representation_attrib.get('id')
1754 lang = representation_attrib.get('lang')
1755 url_el = representation.find(_add_ns('BaseURL'))
1756 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1757 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1759 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1761 'manifest_url': mpd_url,
1762 'ext': mimetype2ext(mime_type),
1763 'width': int_or_none(representation_attrib.get('width')),
1764 'height': int_or_none(representation_attrib.get('height')),
1765 'tbr': int_or_none(bandwidth, 1000),
1766 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1767 'fps': int_or_none(representation_attrib.get('frameRate')),
1768 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1769 'format_note': 'DASH %s' % content_type,
1770 'filesize': filesize,
1772 f.update(parse_codecs(representation_attrib.get('codecs')))
1773 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1775 def prepare_template(template_name, identifiers):
1776 t = representation_ms_info[template_name]
1777 t = t.replace('$RepresentationID$', representation_id)
1778 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1779 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1780 t.replace('$$', '$')
1783 # @initialization is a regular template like @media one
1784 # so it should be handled just the same way (see
1785 # https://github.com/rg3/youtube-dl/issues/11605)
1786 if 'initialization' in representation_ms_info:
1787 initialization_template = prepare_template(
1789 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1790 # $Time$ shall not be included for @initialization thus
1791 # only $Bandwidth$ remains
1793 representation_ms_info['initialization_url'] = initialization_template % {
1794 'Bandwidth': bandwidth,
1797 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1799 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1801 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1802 # can't be used at the same time
1803 if '%(Number' in media_template and 's' not in representation_ms_info:
1804 segment_duration = None
1805 if 'total_number' not in representation_ms_info and 'segment_duration':
1806 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1807 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1808 representation_ms_info['fragments'] = [{
1809 'url': media_template % {
1810 'Number': segment_number,
1811 'Bandwidth': bandwidth,
1813 'duration': segment_duration,
1814 } for segment_number in range(
1815 representation_ms_info['start_number'],
1816 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1818 # $Number*$ or $Time$ in media template with S list available
1819 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1820 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1821 representation_ms_info['fragments'] = []
1824 segment_number = representation_ms_info['start_number']
1826 def add_segment_url():
1827 segment_url = media_template % {
1828 'Time': segment_time,
1829 'Bandwidth': bandwidth,
1830 'Number': segment_number,
1832 representation_ms_info['fragments'].append({
1834 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1837 for num, s in enumerate(representation_ms_info['s']):
1838 segment_time = s.get('t') or segment_time
1842 for r in range(s.get('r', 0)):
1843 segment_time += segment_d
1846 segment_time += segment_d
1847 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1849 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1850 # or any YouTube dashsegments video
1853 timescale = representation_ms_info['timescale']
1854 for s in representation_ms_info['s']:
1855 duration = float_or_none(s['d'], timescale)
1856 for r in range(s.get('r', 0) + 1):
1858 'url': representation_ms_info['segment_urls'][segment_index],
1859 'duration': duration,
1862 representation_ms_info['fragments'] = fragments
1863 # NB: MPD manifest may contain direct URLs to unfragmented media.
1864 # No fragments key is present in this case.
1865 if 'fragments' in representation_ms_info:
1868 'protocol': 'http_dash_segments',
1870 if 'initialization_url' in representation_ms_info:
1871 initialization_url = representation_ms_info['initialization_url']
1872 if not f.get('url'):
1873 f['url'] = initialization_url
1874 f['fragments'].append({'url': initialization_url})
1875 f['fragments'].extend(representation_ms_info['fragments'])
1876 for fragment in f['fragments']:
1877 fragment['url'] = urljoin(base_url, fragment['url'])
1879 existing_format = next(
1880 fo for fo in formats
1881 if fo['format_id'] == representation_id)
1882 except StopIteration:
1883 full_info = formats_dict.get(representation_id, {}).copy()
1885 formats.append(full_info)
1887 existing_format.update(f)
1889 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1892 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1893 res = self._download_webpage_handle(
1895 note=note or 'Downloading ISM manifest',
1896 errnote=errnote or 'Failed to download ISM manifest',
1902 return self._parse_ism_formats(
1903 compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
# Parse a Smooth Streaming manifest document into youtube-dl format dicts.
# NOTE(review): this excerpt elides a number of original lines (e.g. the
# early return for live/DRM manifests, fragment bookkeeping setup, and the
# formats.append(...) scaffolding) — gaps are flagged inline.
1905 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
# Live streams and Protection (DRM) manifests are not supported; the
# (elided) body of this branch presumably bails out early — TODO confirm.
1906 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1909 duration = int(ism_doc.attrib['Duration'])
# Manifest-level timescale; 10,000,000 (100ns ticks) when absent.
1910 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1913 for stream in ism_doc.findall('StreamIndex'):
1914 stream_type = stream.get('Type')
# Only audio/video StreamIndex elements are handled (no text tracks).
1915 if stream_type not in ('video', 'audio'):
1917 url_pattern = stream.attrib['Url']
# A stream may override the manifest-level timescale.
1918 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1919 stream_name = stream.get('Name')
1920 for track in stream.findall('QualityLevel'):
1921 fourcc = track.get('FourCC')
1922 # TODO: add support for WVC1 and WMAP
1923 if fourcc not in ('H264', 'AVC1', 'AACL'):
1924 self.report_warning('%s is not a supported codec' % fourcc)
1926 tbr = int(track.attrib['Bitrate']) // 1000
1927 width = int_or_none(track.get('MaxWidth'))
1928 height = int_or_none(track.get('MaxHeight'))
1929 sampling_rate = int_or_none(track.get('SamplingRate'))
# Substitute the bitrate into the {Bitrate}/{bitrate} URL template and
# resolve it against the manifest URL.
1931 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1932 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
# Walk the <c> fragment timeline; fragment_ctx is initialized in lines
# elided from this excerpt — presumably {'time': 0} — TODO confirm.
1938 stream_fragments = stream.findall('c')
1939 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1940 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1941 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1942 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
# When 'd' is absent, derive the duration from the next fragment's start
# time (or the total duration for the last fragment), split over repeats.
1943 if not fragment_ctx['duration']:
1945 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1947 next_fragment_time = duration
1948 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1949 for _ in range(fragment_repeat):
# Each repeat yields one fragment URL with {start time} substituted.
1951 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1952 'duration': fragment_ctx['duration'] / stream_timescale,
1954 fragment_ctx['time'] += fragment_ctx['duration']
# format_id is assembled from ism_id / stream name / bitrate when present.
1958 format_id.append(ism_id)
1960 format_id.append(stream_name)
1961 format_id.append(compat_str(tbr))
1964 'format_id': '-'.join(format_id),
1966 'manifest_url': ism_url,
1967 'ext': 'ismv' if stream_type == 'video' else 'isma',
1971 'asr': sampling_rate,
# FourCC doubles as the codec id; the opposite codec field is 'none'.
1972 'vcodec': 'none' if stream_type == 'audio' else fourcc,
1973 'acodec': 'none' if stream_type == 'video' else fourcc,
1975 'fragments': fragments,
# Extra parameters consumed by the ISM downloader, not by callers.
1976 '_download_params': {
1977 'duration': duration,
1978 'timescale': stream_timescale,
1979 'width': width or 0,
1980 'height': height or 0,
1982 'codec_private_data': track.get('CodecPrivateData'),
1983 'sampling_rate': sampling_rate,
1984 'channels': int_or_none(track.get('Channels', 2)),
1985 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
1986 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
# Scrape <video>/<audio> tags (and their <source>/<track> children) out of a
# webpage and build media entries (formats + subtitles) for each.
# NOTE(review): this excerpt elides several original lines (entries/media_info
# initialization, some branch keywords); gaps are flagged inline.
1991 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
# Resolve possibly-relative media URLs against the page URL.
1992 def absolute_url(video_url):
1993 return compat_urlparse.urljoin(base_url, video_url)
# Turn a MIME type attribute (optionally with codecs=...) into a partial
# format dict carrying 'ext' and codec fields.
1995 def parse_content_type(content_type):
1996 if not content_type:
1998 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2000 mimetype, codecs = ctr.groups()
2001 f = parse_codecs(codecs)
2002 f['ext'] = mimetype2ext(mimetype)
# Classify a source URL: manifests (HLS/DASH) expand into multiple formats
# (is_plain_url False); otherwise a single plain-URL format is produced.
2006 def _media_formats(src, cur_media_type):
2007 full_url = absolute_url(src)
2008 ext = determine_ext(full_url)
2010 is_plain_url = False
2011 formats = self._extract_m3u8_formats(
2012 full_url, video_id, ext='mp4',
2013 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
2015 is_plain_url = False
2016 formats = self._extract_mpd_formats(
2017 full_url, video_id, mpd_id=mpd_id)
2022 'vcodec': 'none' if cur_media_type == 'audio' else None,
2024 return is_plain_url, formats
# Self-closing tags first (no inner content), then open/close pairs.
2027 media_tags = [(media_tag, media_type, '')
2028 for media_tag, media_type
2029 in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
2030 media_tags.extend(re.findall(
2031 # We only allow video|audio followed by a whitespace or '>'.
2032 # Allowing more characters may end up in significant slow down (see
2033 # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2034 # http://www.porntrex.com/maps/videositemap.xml).
2035 r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2036 for media_tag, media_type, media_content in media_tags:
2041 media_attributes = extract_attributes(media_tag)
2042 src = media_attributes.get('src')
# Direct src attribute on the media tag itself.
2044 _, formats = _media_formats(src, media_type)
2045 media_info['formats'].extend(formats)
2046 media_info['thumbnail'] = media_attributes.get('poster')
# Nested <source> tags inside the media element.
2048 for source_tag in re.findall(r'<source[^>]+>', media_content):
2049 source_attributes = extract_attributes(source_tag)
2050 src = source_attributes.get('src')
2053 is_plain_url, formats = _media_formats(src, media_type)
# For plain URLs, merge the type="..." info into the single format.
2055 f = parse_content_type(source_attributes.get('type'))
2056 f.update(formats[0])
2057 media_info['formats'].append(f)
2059 media_info['formats'].extend(formats)
# <track> tags carry subtitles/captions.
2060 for track_tag in re.findall(r'<track[^>]+>', media_content):
2061 track_attributes = extract_attributes(track_tag)
2062 kind = track_attributes.get('kind')
2063 if not kind or kind in ('subtitles', 'captions'):
2064 src = track_attributes.get('src')
2067 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2068 media_info['subtitles'].setdefault(lang, []).append({
2069 'url': absolute_url(src),
# Only emit an entry when something usable was found.
2071 if media_info['formats'] or media_info['subtitles']:
2072 entries.append(media_info)
2075 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2077 hdcore_sign = 'hdcore=3.7.0'
2078 f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2079 hds_host = hosts.get('hds')
2081 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2082 if 'hdcore=' not in f4m_url:
2083 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2084 f4m_formats = self._extract_f4m_formats(
2085 f4m_url, video_id, f4m_id='hds', fatal=False)
2086 for entry in f4m_formats:
2087 entry.update({'extra_param_to_segment_url': hdcore_sign})
2088 formats.extend(f4m_formats)
2089 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2090 hls_host = hosts.get('hls')
2092 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2093 formats.extend(self._extract_m3u8_formats(
2094 m3u8_url, video_id, 'mp4', 'm3u8_native',
2095 m3u8_id='hls', fatal=False))
2098 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2099 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2100 url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2101 http_base_url = 'http' + url_base
2103 if 'm3u8' not in skip_protocols:
2104 formats.extend(self._extract_m3u8_formats(
2105 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2106 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2107 if 'f4m' not in skip_protocols:
2108 formats.extend(self._extract_f4m_formats(
2109 http_base_url + '/manifest.f4m',
2110 video_id, f4m_id='hds', fatal=False))
2111 if 'dash' not in skip_protocols:
2112 formats.extend(self._extract_mpd_formats(
2113 http_base_url + '/manifest.mpd',
2114 video_id, mpd_id='dash', fatal=False))
2115 if re.search(r'(?:/smil:|\.smil)', url_base):
2116 if 'smil' not in skip_protocols:
2117 rtmp_formats = self._extract_smil_formats(
2118 http_base_url + '/jwplayer.smil',
2119 video_id, fatal=False)
2120 for rtmp_format in rtmp_formats:
2121 rtsp_format = rtmp_format.copy()
2122 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2123 del rtsp_format['play_path']
2124 del rtsp_format['ext']
2125 rtsp_format.update({
2126 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2127 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2130 formats.extend([rtmp_format, rtsp_format])
2132 for protocol in ('rtmp', 'rtsp'):
2133 if protocol not in skip_protocols:
2135 'url': protocol + url_base,
2136 'format_id': protocol,
2137 'protocol': protocol,
# Locate the options blob of a jwplayer("...").setup({...}) call in a page.
# NOTE(review): this excerpt elides the re.search(...) call line and the
# mobj None-check; only the pattern and the success return are visible.
2142 def _find_jwplayer_data(webpage):
2144 r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2147 return mobj.group('options')
2149 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2150 jwplayer_data = self._parse_json(
2151 self._find_jwplayer_data(webpage), video_id,
2152 transform_source=js_to_json)
2153 return self._parse_jwplayer_data(
2154 jwplayer_data, video_id, *args, **kwargs)
# Convert a (possibly legacy-shaped) JWPlayer config dict into youtube-dl
# entries: formats, subtitles and basic metadata per playlist item.
# NOTE(review): this excerpt elides several original lines (entries/formats
# initialization, some elif/else keywords, the entries.append scaffolding);
# gaps are flagged inline.
2156 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2157 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2158 # JWPlayer backward compatibility: flattened playlists
2159 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2160 if 'playlist' not in jwplayer_data:
2161 jwplayer_data = {'playlist': [jwplayer_data]}
2165 # JWPlayer backward compatibility: single playlist item
2166 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2167 if not isinstance(jwplayer_data['playlist'], list):
2168 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2170 for video_data in jwplayer_data['playlist']:
2171 # JWPlayer backward compatibility: flattened sources
2172 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2173 if 'sources' not in video_data:
2174 video_data['sources'] = [video_data]
2176 this_video_id = video_id or video_data['mediaid']
2179 for source in video_data['sources']:
2180 source_url = self._proto_relative_url(source['file'])
2182 source_url = compat_urlparse.urljoin(base_url, source_url)
2183 source_type = source.get('type') or ''
2184 ext = mimetype2ext(source_type) or determine_ext(source_url)
# Manifest sources expand into multiple formats; plain sources get one.
2185 if source_type == 'hls' or ext == 'm3u8':
2186 formats.extend(self._extract_m3u8_formats(
2187 source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
2189 formats.extend(self._extract_mpd_formats(
2190 source_url, this_video_id, mpd_id=mpd_id, fatal=False))
2191 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2192 elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2199 height = int_or_none(source.get('height'))
2201 # Often no height is provided but there is a label in
2202 # format like 1080p.
2203 height = int_or_none(self._search_regex(
2204 r'^(\d{3,})[pP]$', source.get('label') or '',
2205 'height', default=None))
2208 'width': int_or_none(source.get('width')),
# RTMP sources are split into base URL + play_path at the mp4:/mp3:/flv:
# prefix, as expected by the flash player's RTMPMediaProvider.
2212 if source_url.startswith('rtmp'):
2213 a_format['ext'] = 'flv'
2215 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2216 # of jwplayer.flash.swf
2217 rtmp_url_parts = re.split(
2218 r'((?:mp4|mp3|flv):)', source_url, 1)
2219 if len(rtmp_url_parts) == 3:
2220 rtmp_url, prefix, play_path = rtmp_url_parts
2223 'play_path': prefix + play_path,
2226 a_format.update(rtmp_params)
2227 formats.append(a_format)
2228 self._sort_formats(formats)
# Caption tracks become subtitles keyed by label (default 'en').
2231 tracks = video_data.get('tracks')
2232 if tracks and isinstance(tracks, list):
2233 for track in tracks:
2234 if track.get('kind') != 'captions':
2236 track_url = urljoin(base_url, track.get('file'))
2239 subtitles.setdefault(track.get('label') or 'en', []).append({
2240 'url': self._proto_relative_url(track_url)
# Per-item metadata; title is mandatory only when require_title is true.
2244 'id': this_video_id,
2245 'title': video_data['title'] if require_title else video_data.get('title'),
2246 'description': video_data.get('description'),
2247 'thumbnail': self._proto_relative_url(video_data.get('image')),
2248 'timestamp': int_or_none(video_data.get('pubdate')),
2249 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2250 'subtitles': subtitles,
# Single entry -> returned directly (elided); multiple -> playlist.
2253 if len(entries) == 1:
2256 return self.playlist_result(entries)
2258 def _live_title(self, name):
2259 """ Generate the title for a live video """
2260 now = datetime.datetime.now()
2261 now_str = now.strftime('%Y-%m-%d %H:%M')
2262 return name + ' ' + now_str
2264 def _int(self, v, name, fatal=False, **kwargs):
2265 res = int_or_none(v, **kwargs)
2266 if 'get_attr' in kwargs:
2267 print(getattr(v, kwargs['get_attr']))
2269 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2271 raise ExtractorError(msg)
2273 self._downloader.report_warning(msg)
# Parse *v* as a float via float_or_none(); raise (fatal) or warn on failure.
# NOTE(review): this excerpt elides the `if res is None:` / `if fatal:` /
# `else:` / `return res` wiring around the visible lines.
2276 def _float(self, v, name, fatal=False, **kwargs):
2277 res = float_or_none(v, **kwargs)
2279 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2281 raise ExtractorError(msg)
2283 self._downloader.report_warning(msg)
2286 def _set_cookie(self, domain, name, value, expire_time=None):
2287 cookie = compat_cookiejar.Cookie(
2288 0, name, value, None, None, domain, None,
2289 None, '/', True, False, expire_time, '', None, None, None)
2290 self._downloader.cookiejar.set_cookie(cookie)
2292 def _get_cookies(self, url):
2293 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2294 req = sanitized_Request(url)
2295 self._downloader.cookiejar.add_cookie_header(req)
2296 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
# Yield this extractor's test cases, from _TEST or _TESTS (mutually
# exclusive), each annotated with the extractor's name.
# NOTE(review): this excerpt elides the branch structure and the yield
# statements between the visible lines.
2298 def get_testcases(self, include_onlymatching=False):
2299 t = getattr(self, '_TEST', None)
# Having both _TEST and _TESTS is a programming error.
2301 assert not hasattr(self, '_TESTS'), \
2302 '%s has _TEST and _TESTS' % type(self).__name__
2305 tests = getattr(self, '_TESTS', [])
# only_matching test cases are skipped unless explicitly requested.
2307 if not include_onlymatching and t.get('only_matching', False):
# Tag each test with the extractor name (class name minus the 'IE' suffix).
2309 t['name'] = type(self).__name__[:-len('IE')]
# NOTE(review): this excerpt elides a couple of original lines (e.g. the
# `continue`/early-return inside the loop after the `if not is_restricted:`
# check).
2312 def is_suitable(self, age_limit):
2313 """ Test whether the extractor is generally suitable for the given
2314 age limit (i.e. pornographic sites are not, all others usually are) """
2316 any_restricted = False
# Inspect the declared test cases' age_limit metadata as a proxy for the
# site's content rating.
2317 for tc in self.get_testcases(include_onlymatching=False):
# For playlist tests, judge by the first playlist entry.
2318 if tc.get('playlist', []):
2319 tc = tc['playlist'][0]
2320 is_restricted = age_restricted(
2321 tc.get('info_dict', {}).get('age_limit'), age_limit)
2322 if not is_restricted:
2324 any_restricted = any_restricted or is_restricted
# Suitable only if no test case was age-restricted beyond the limit.
2325 return not any_restricted
# Dispatch to _get_subtitles() only when the user asked for subtitles
# (writesubtitles/listsubtitles); the fallback return (presumably an empty
# dict) is elided from this excerpt.
2327 def extract_subtitles(self, *args, **kwargs):
2328 if (self._downloader.params.get('writesubtitles', False) or
2329 self._downloader.params.get('listsubtitles')):
2330 return self._get_subtitles(*args, **kwargs)
2333 def _get_subtitles(self, *args, **kwargs):
2334 raise NotImplementedError('This method must be implemented by subclasses')
# NOTE(review): the final `return ret` appears elided from this excerpt;
# the @staticmethod decorator (no self/cls parameter) is also not visible.
2337 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2338 """ Merge subtitle items for one language. Items with duplicated URLs
2339 will be dropped. """
# Seed the result with list1, then append only list2 items whose URL was
# not already present in list1.
2340 list1_urls = set([item['url'] for item in subtitle_list1])
2341 ret = list(subtitle_list1)
2342 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
# NOTE(review): the @classmethod decorator and the trailing `return ret`
# appear elided from this excerpt.
2346 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2347 """ Merge two subtitle dictionaries, language by language. """
2348 ret = dict(subtitle_dict1)
# Languages only in dict2 merge against an empty list; shared languages
# are merged item-by-item with URL de-duplication.
2349 for lang in subtitle_dict2:
2350 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
# Dispatch to _get_automatic_captions() only when the user asked for them
# (writeautomaticsub/listsubtitles); the fallback return (presumably an
# empty dict) is elided from this excerpt.
2353 def extract_automatic_captions(self, *args, **kwargs):
2354 if (self._downloader.params.get('writeautomaticsub', False) or
2355 self._downloader.params.get('listsubtitles')):
2356 return self._get_automatic_captions(*args, **kwargs)
2359 def _get_automatic_captions(self, *args, **kwargs):
2360 raise NotImplementedError('This method must be implemented by subclasses')
2362 def mark_watched(self, *args, **kwargs):
2363 if (self._downloader.params.get('mark_watched', False) and
2364 (self._get_login_info()[0] is not None or
2365 self._downloader.params.get('cookiefile') is not None)):
2366 self._mark_watched(*args, **kwargs)
2368 def _mark_watched(self, *args, **kwargs):
2369 raise NotImplementedError('This method must be implemented by subclasses')
# Build extra request headers used for geo verification.
# NOTE(review): the `headers = {}` initialization and the `return headers`
# appear elided from this excerpt.
2371 def geo_verification_headers(self):
# When the user configured a geo verification proxy, advertise it via the
# Ytdl-request-proxy header.
2373 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2374 if geo_verification_proxy:
2375 headers['Ytdl-request-proxy'] = geo_verification_proxy
2378 def _generic_id(self, url):
2379 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2381 def _generic_title(self, url):
2382 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
# SearchInfoExtractor: shared plumbing for "<_SEARCH_KEY>[N|all]:query" URLs.
# NOTE(review): the docstring's opening/closing triple-quote lines appear
# elided from this excerpt.
2385 class SearchInfoExtractor(InfoExtractor):
2387 Base class for paged search queries extractors.
2388 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
2389 Instances should define _SEARCH_KEY and _MAX_RESULTS.
2393 def _make_valid_url(cls):
2394 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2397 def suitable(cls, url):
2398 return re.match(cls._make_valid_url(), url) is not None
# Parse a search "URL" (<key><prefix>:<query>) and run the search with the
# requested result count.
# NOTE(review): this excerpt elides the `if mobj is None:` check, the
# `if prefix == '':` branch keyword and the `else:`/`n = int(prefix)` lines.
2400 def _real_extract(self, query):
2401 mobj = re.match(self._make_valid_url(), query)
2403 raise ExtractorError('Invalid search query "%s"' % query)
2405 prefix = mobj.group('prefix')
2406 query = mobj.group('query')
# Empty prefix -> single result.
2408 return self._get_n_results(query, 1)
# 'all' -> the extractor's maximum.
2409 elif prefix == 'all':
2410 return self._get_n_results(query, self._MAX_RESULTS)
# Otherwise the prefix is a (presumably int-parsed, elided) count n;
# non-positive counts are rejected, oversized ones clamped with a warning.
2414 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
2415 elif n > self._MAX_RESULTS:
2416 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2417 n = self._MAX_RESULTS
2418 return self._get_n_results(query, n)
2420 def _get_n_results(self, query, n):
2421 """Get a specified number of results for a query"""
2422 raise NotImplementedError('This method must be implemented by subclasses')
2425 def SEARCH_KEY(self):
2426 return self._SEARCH_KEY